ArchSid committed on
Commit
5782a40
·
verified ·
1 Parent(s): e7b71f4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: CohereForAI/aya-expanse-8b
3
+ library_name: peft
4
+ license: other
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: aya-expanse-8b
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # aya-expanse-8b
18
+
19
+ This model is a fine-tuned version of [CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b) on the combined_train_gemba dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 2
43
+ - total_train_batch_size: 4
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 1.0
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - PEFT 0.12.0
56
+ - Transformers 4.43.4
57
+ - PyTorch 2.4.0+cu121
58
+ - Datasets 2.20.0
59
+ - Tokenizers 0.19.1
adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "CohereForAI/aya-expanse-8b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c66393ea3ef4f232d80b3017f2271aeb54dbf8bcec5703ac23c301f8e803dd96
3
+ size 109069176
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.999973032011003,
3
+ "total_flos": 7.616663384064e+17,
4
+ "train_loss": 0.3924344686242755,
5
+ "train_runtime": 20071.1732,
6
+ "train_samples_per_second": 3.695,
7
+ "train_steps_per_second": 0.924
8
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<BOS_TOKEN>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|END_OF_TURN_TOKEN|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<PAD>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c69a7ea6c0927dfac8c349186ebcf0466a4723c21cbdb2e850cf559f0bee92b8
3
+ size 12777433
tokenizer_config.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<PAD>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<UNK>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "<CLS>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<SEP>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "4": {
39
+ "content": "<MASK_TOKEN>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "5": {
47
+ "content": "<BOS_TOKEN>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "6": {
55
+ "content": "<EOS_TOKEN>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "7": {
63
+ "content": "<EOP_TOKEN>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "255000": {
71
+ "content": "<|START_OF_TURN_TOKEN|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "255001": {
79
+ "content": "<|END_OF_TURN_TOKEN|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "255002": {
87
+ "content": "<|YES_TOKEN|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "255003": {
95
+ "content": "<|NO_TOKEN|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "255004": {
103
+ "content": "<|GOOD_TOKEN|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "255005": {
111
+ "content": "<|BAD_TOKEN|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "255006": {
119
+ "content": "<|USER_TOKEN|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "255007": {
127
+ "content": "<|CHATBOT_TOKEN|>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "255008": {
135
+ "content": "<|SYSTEM_TOKEN|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "255009": {
143
+ "content": "<|USER_0_TOKEN|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "255010": {
151
+ "content": "<|USER_1_TOKEN|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "255011": {
159
+ "content": "<|USER_2_TOKEN|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "255012": {
167
+ "content": "<|USER_3_TOKEN|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "255013": {
175
+ "content": "<|USER_4_TOKEN|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "255014": {
183
+ "content": "<|USER_5_TOKEN|>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "255015": {
191
+ "content": "<|USER_6_TOKEN|>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "255016": {
199
+ "content": "<|USER_7_TOKEN|>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "255017": {
207
+ "content": "<|USER_8_TOKEN|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "255018": {
215
+ "content": "<|USER_9_TOKEN|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "255019": {
223
+ "content": "<|EXTRA_0_TOKEN|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": false
229
+ },
230
+ "255020": {
231
+ "content": "<|EXTRA_1_TOKEN|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": false
237
+ },
238
+ "255021": {
239
+ "content": "<|EXTRA_2_TOKEN|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": false
245
+ },
246
+ "255022": {
247
+ "content": "<|EXTRA_3_TOKEN|>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": false
253
+ },
254
+ "255023": {
255
+ "content": "<|EXTRA_4_TOKEN|>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": false
261
+ },
262
+ "255024": {
263
+ "content": "<|EXTRA_5_TOKEN|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": false
269
+ },
270
+ "255025": {
271
+ "content": "<|EXTRA_6_TOKEN|>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": false
277
+ },
278
+ "255026": {
279
+ "content": "<|EXTRA_7_TOKEN|>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": false
285
+ },
286
+ "255027": {
287
+ "content": "<|EXTRA_8_TOKEN|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": false
293
+ },
294
+ "255028": {
295
+ "content": "<|EXTRA_9_TOKEN|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": false
301
+ }
302
+ },
303
+ "bos_token": "<BOS_TOKEN>",
304
+ "chat_template": "{{ '<BOS_TOKEN>' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content + '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ content + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}",
305
+ "clean_up_tokenization_spaces": false,
306
+ "eos_token": "<|END_OF_TURN_TOKEN|>",
307
+ "legacy": true,
308
+ "merges_file": null,
309
+ "model_max_length": 1000000000000000019884624838656,
310
+ "pad_token": "<PAD>",
311
+ "padding_side": "right",
312
+ "sp_model_kwargs": {},
313
+ "spaces_between_special_tokens": false,
314
+ "split_special_tokens": false,
315
+ "tokenizer_class": "CohereTokenizer",
316
+ "unk_token": null,
317
+ "use_default_system_prompt": false,
318
+ "vocab_file": null
319
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.999973032011003,
3
+ "total_flos": 7.616663384064e+17,
4
+ "train_loss": 0.3924344686242755,
5
+ "train_runtime": 20071.1732,
6
+ "train_samples_per_second": 3.695,
7
+ "train_steps_per_second": 0.924
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 500, "total_steps": 18540, "loss": 0.6808, "learning_rate": 4.9910325182530915e-05, "epoch": 0.026967988997060488, "percentage": 2.7, "elapsed_time": "0:09:28", "remaining_time": "5:42:03", "throughput": "0.00", "total_tokens": 0}
2
+ {"current_steps": 1000, "total_steps": 18540, "loss": 0.458, "learning_rate": 4.9641944055954695e-05, "epoch": 0.053935977994120976, "percentage": 5.39, "elapsed_time": "0:09:39", "remaining_time": "2:49:17", "throughput": "0.00", "total_tokens": 0}
3
+ {"current_steps": 1500, "total_steps": 18540, "loss": 0.4483, "learning_rate": 4.9196781982554374e-05, "epoch": 0.08090396699118146, "percentage": 8.09, "elapsed_time": "0:19:30", "remaining_time": "3:41:32", "throughput": "0.00", "total_tokens": 0}
4
+ {"current_steps": 2000, "total_steps": 18540, "loss": 0.45, "learning_rate": 4.857803254854406e-05, "epoch": 0.10787195598824195, "percentage": 10.79, "elapsed_time": "0:09:27", "remaining_time": "1:18:16", "throughput": "0.00", "total_tokens": 0}
5
+ {"current_steps": 2500, "total_steps": 18540, "loss": 0.4444, "learning_rate": 4.7790134653328074e-05, "epoch": 0.13483994498530244, "percentage": 13.48, "elapsed_time": "0:19:13", "remaining_time": "2:03:17", "throughput": "0.00", "total_tokens": 0}
6
+ {"current_steps": 3000, "total_steps": 18540, "loss": 0.438, "learning_rate": 4.6838740664901435e-05, "epoch": 0.16180793398236293, "percentage": 16.18, "elapsed_time": "0:29:06", "remaining_time": "2:30:46", "throughput": "0.00", "total_tokens": 0}
7
+ {"current_steps": 3500, "total_steps": 18540, "loss": 0.4406, "learning_rate": 4.573067586984441e-05, "epoch": 0.18877592297942342, "percentage": 18.88, "elapsed_time": "0:38:59", "remaining_time": "2:47:33", "throughput": "0.00", "total_tokens": 0}
8
+ {"current_steps": 4000, "total_steps": 18540, "loss": 0.4293, "learning_rate": 4.447388950881625e-05, "epoch": 0.2157439119764839, "percentage": 21.57, "elapsed_time": "0:48:45", "remaining_time": "2:57:15", "throughput": "0.00", "total_tokens": 0}
9
+ {"current_steps": 4500, "total_steps": 18540, "loss": 0.4347, "learning_rate": 4.307739774881878e-05, "epoch": 0.2427119009735444, "percentage": 24.27, "elapsed_time": "0:58:38", "remaining_time": "3:02:58", "throughput": "0.00", "total_tokens": 0}
10
+ {"current_steps": 5000, "total_steps": 18540, "loss": 0.4303, "learning_rate": 4.1551219001346e-05, "epoch": 0.2696798899706049, "percentage": 26.97, "elapsed_time": "1:08:38", "remaining_time": "3:05:54", "throughput": "0.00", "total_tokens": 0}
11
+ {"current_steps": 5500, "total_steps": 18540, "loss": 0.4335, "learning_rate": 3.990630205044629e-05, "epoch": 0.29664787896766537, "percentage": 29.67, "elapsed_time": "1:18:27", "remaining_time": "3:06:01", "throughput": "0.00", "total_tokens": 0}
12
+ {"current_steps": 6000, "total_steps": 18540, "loss": 0.4307, "learning_rate": 3.815804970461473e-05, "epoch": 0.32361586796472586, "percentage": 32.36, "elapsed_time": "1:28:17", "remaining_time": "3:04:32", "throughput": "0.00", "total_tokens": 0}
13
+ {"current_steps": 6500, "total_steps": 18540, "loss": 0.433, "learning_rate": 3.6312001077632294e-05, "epoch": 0.35058385696178634, "percentage": 35.06, "elapsed_time": "1:38:02", "remaining_time": "3:01:36", "throughput": "0.00", "total_tokens": 0}
14
+ {"current_steps": 7000, "total_steps": 18540, "loss": 0.4305, "learning_rate": 3.438480032010211e-05, "epoch": 0.37755184595884683, "percentage": 37.76, "elapsed_time": "1:47:59", "remaining_time": "2:58:02", "throughput": "0.00", "total_tokens": 0}
15
+ {"current_steps": 7500, "total_steps": 18540, "loss": 0.4292, "learning_rate": 3.2390273142116814e-05, "epoch": 0.4045198349559073, "percentage": 40.45, "elapsed_time": "1:57:58", "remaining_time": "2:53:39", "throughput": "0.00", "total_tokens": 0}
16
+ {"current_steps": 8000, "total_steps": 18540, "loss": 0.4271, "learning_rate": 3.034272825252622e-05, "epoch": 0.4314878239529678, "percentage": 43.15, "elapsed_time": "2:07:52", "remaining_time": "2:48:28", "throughput": "0.00", "total_tokens": 0}
17
+ {"current_steps": 8500, "total_steps": 18540, "loss": 0.4265, "learning_rate": 2.8256854708469055e-05, "epoch": 0.4584558129500283, "percentage": 45.85, "elapsed_time": "2:17:45", "remaining_time": "2:42:42", "throughput": "0.00", "total_tokens": 0}
18
+ {"current_steps": 9000, "total_steps": 18540, "loss": 0.425, "learning_rate": 2.6147616536291464e-05, "epoch": 0.4854238019470888, "percentage": 48.54, "elapsed_time": "2:27:29", "remaining_time": "2:36:20", "throughput": "0.00", "total_tokens": 0}
19
+ {"current_steps": 9500, "total_steps": 18540, "loss": 0.4263, "learning_rate": 2.4030145379840563e-05, "epoch": 0.5123917909441493, "percentage": 51.24, "elapsed_time": "2:37:21", "remaining_time": "2:29:44", "throughput": "0.00", "total_tokens": 0}
20
+ {"current_steps": 10000, "total_steps": 18540, "loss": 0.4225, "learning_rate": 2.1919631946272402e-05, "epoch": 0.5393597799412098, "percentage": 53.94, "elapsed_time": "2:47:09", "remaining_time": "2:22:45", "throughput": "0.00", "total_tokens": 0}
21
+ {"current_steps": 10500, "total_steps": 18540, "loss": 0.4264, "learning_rate": 1.9831217028140688e-05, "epoch": 0.5663277689382703, "percentage": 56.63, "elapsed_time": "2:57:05", "remaining_time": "2:15:35", "throughput": "0.00", "total_tokens": 0}
22
+ {"current_steps": 11000, "total_steps": 18540, "loss": 0.4242, "learning_rate": 1.777988288357209e-05, "epoch": 0.5932957579353307, "percentage": 59.33, "elapsed_time": "3:06:56", "remaining_time": "2:08:08", "throughput": "0.00", "total_tokens": 0}
23
+ {"current_steps": 11500, "total_steps": 18540, "loss": 0.4223, "learning_rate": 1.578034575376518e-05, "epoch": 0.6202637469323913, "percentage": 62.03, "elapsed_time": "3:16:41", "remaining_time": "2:00:24", "throughput": "0.00", "total_tokens": 0}
24
+ {"current_steps": 12000, "total_steps": 18540, "loss": 0.4209, "learning_rate": 1.3850741762328944e-05, "epoch": 0.6472317359294517, "percentage": 64.72, "elapsed_time": "3:26:36", "remaining_time": "1:52:35", "throughput": "0.00", "total_tokens": 0}
25
+ {"current_steps": 12500, "total_steps": 18540, "loss": 0.4232, "learning_rate": 1.1997184612520374e-05, "epoch": 0.6741997249265123, "percentage": 67.42, "elapsed_time": "3:36:28", "remaining_time": "1:44:35", "throughput": "0.00", "total_tokens": 0}
26
+ {"current_steps": 13000, "total_steps": 18540, "loss": 0.4229, "learning_rate": 1.0236909470428333e-05, "epoch": 0.7011677139235727, "percentage": 70.12, "elapsed_time": "3:46:13", "remaining_time": "1:36:24", "throughput": "0.00", "total_tokens": 0}
27
+ {"current_steps": 13500, "total_steps": 18540, "loss": 0.4187, "learning_rate": 8.585739531996178e-06, "epoch": 0.7281357029206332, "percentage": 72.82, "elapsed_time": "3:56:01", "remaining_time": "1:28:07", "throughput": "0.00", "total_tokens": 0}
28
+ {"current_steps": 14000, "total_steps": 18540, "loss": 0.4244, "learning_rate": 7.048906317823642e-06, "epoch": 0.7551036919176937, "percentage": 75.51, "elapsed_time": "4:05:52", "remaining_time": "1:19:44", "throughput": "0.00", "total_tokens": 0}
29
+ {"current_steps": 14500, "total_steps": 18540, "loss": 0.4178, "learning_rate": 5.640853987596667e-06, "epoch": 0.7820716809147542, "percentage": 78.21, "elapsed_time": "4:15:43", "remaining_time": "1:11:15", "throughput": "0.00", "total_tokens": 0}
30
+ {"current_steps": 15000, "total_steps": 18540, "loss": 0.4195, "learning_rate": 4.371683888171277e-06, "epoch": 0.8090396699118146, "percentage": 80.91, "elapsed_time": "4:25:27", "remaining_time": "1:02:38", "throughput": "0.00", "total_tokens": 0}
31
+ {"current_steps": 15500, "total_steps": 18540, "loss": 0.4203, "learning_rate": 3.250501027307715e-06, "epoch": 0.8360076589088752, "percentage": 83.6, "elapsed_time": "4:35:07", "remaining_time": "0:53:57", "throughput": "0.00", "total_tokens": 0}
32
+ {"current_steps": 16000, "total_steps": 18540, "loss": 0.4187, "learning_rate": 2.287118546736572e-06, "epoch": 0.8629756479059356, "percentage": 86.3, "elapsed_time": "4:44:44", "remaining_time": "0:45:12", "throughput": "0.00", "total_tokens": 0}
33
+ {"current_steps": 16500, "total_steps": 18540, "loss": 0.4202, "learning_rate": 1.4845888005343062e-06, "epoch": 0.8899436369029962, "percentage": 89.0, "elapsed_time": "4:54:30", "remaining_time": "0:36:24", "throughput": "0.00", "total_tokens": 0}
34
+ {"current_steps": 17000, "total_steps": 18540, "loss": 0.4221, "learning_rate": 8.507582708938533e-07, "epoch": 0.9169116259000566, "percentage": 91.69, "elapsed_time": "5:04:13", "remaining_time": "0:27:33", "throughput": "0.00", "total_tokens": 0}
35
+ {"current_steps": 17500, "total_steps": 18540, "loss": 0.4217, "learning_rate": 3.901740487793598e-07, "epoch": 0.9438796148971171, "percentage": 94.39, "elapsed_time": "5:14:03", "remaining_time": "0:18:39", "throughput": "0.00", "total_tokens": 0}
36
+ {"current_steps": 18000, "total_steps": 18540, "loss": 0.4209, "learning_rate": 1.0614035867460847e-07, "epoch": 0.9708476038941776, "percentage": 97.09, "elapsed_time": "5:23:49", "remaining_time": "0:09:42", "throughput": "0.00", "total_tokens": 0}
37
+ {"current_steps": 18500, "total_steps": 18540, "loss": 0.4205, "learning_rate": 6.94854124816402e-10, "epoch": 0.9978155928912381, "percentage": 99.78, "elapsed_time": "5:33:37", "remaining_time": "0:00:43", "throughput": "0.00", "total_tokens": 0}
38
+ {"current_steps": 18540, "total_steps": 18540, "epoch": 0.999973032011003, "percentage": 100.0, "elapsed_time": "5:34:31", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
trainer_state.json ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.999973032011003,
5
+ "eval_steps": 500,
6
+ "global_step": 18540,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.026967988997060488,
13
+ "grad_norm": 0.8829220533370972,
14
+ "learning_rate": 4.9910325182530915e-05,
15
+ "loss": 0.6808,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.053935977994120976,
20
+ "grad_norm": 1.0129338502883911,
21
+ "learning_rate": 4.9641944055954695e-05,
22
+ "loss": 0.458,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.08090396699118146,
27
+ "grad_norm": 0.8441129326820374,
28
+ "learning_rate": 4.9196781982554374e-05,
29
+ "loss": 0.4483,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.10787195598824195,
34
+ "grad_norm": 0.3246585726737976,
35
+ "learning_rate": 4.857803254854406e-05,
36
+ "loss": 0.45,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.13483994498530244,
41
+ "grad_norm": 0.44776633381843567,
42
+ "learning_rate": 4.7790134653328074e-05,
43
+ "loss": 0.4444,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.16180793398236293,
48
+ "grad_norm": 0.49329063296318054,
49
+ "learning_rate": 4.6838740664901435e-05,
50
+ "loss": 0.438,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.18877592297942342,
55
+ "grad_norm": 0.47724494338035583,
56
+ "learning_rate": 4.573067586984441e-05,
57
+ "loss": 0.4406,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.2157439119764839,
62
+ "grad_norm": 0.5984110236167908,
63
+ "learning_rate": 4.447388950881625e-05,
64
+ "loss": 0.4293,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.2427119009735444,
69
+ "grad_norm": 0.29806196689605713,
70
+ "learning_rate": 4.307739774881878e-05,
71
+ "loss": 0.4347,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.2696798899706049,
76
+ "grad_norm": 0.5836197137832642,
77
+ "learning_rate": 4.1551219001346e-05,
78
+ "loss": 0.4303,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.29664787896766537,
83
+ "grad_norm": 0.35174188017845154,
84
+ "learning_rate": 3.990630205044629e-05,
85
+ "loss": 0.4335,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.32361586796472586,
90
+ "grad_norm": 0.529833197593689,
91
+ "learning_rate": 3.815804970461473e-05,
92
+ "loss": 0.4307,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.35058385696178634,
97
+ "grad_norm": 0.2829456329345703,
98
+ "learning_rate": 3.6312001077632294e-05,
99
+ "loss": 0.433,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.37755184595884683,
104
+ "grad_norm": 0.3882625102996826,
105
+ "learning_rate": 3.438480032010211e-05,
106
+ "loss": 0.4305,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.4045198349559073,
111
+ "grad_norm": 0.4895012378692627,
112
+ "learning_rate": 3.2390273142116814e-05,
113
+ "loss": 0.4292,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.4314878239529678,
118
+ "grad_norm": 0.4984913468360901,
119
+ "learning_rate": 3.034272825252622e-05,
120
+ "loss": 0.4271,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.4584558129500283,
125
+ "grad_norm": 0.4299156367778778,
126
+ "learning_rate": 2.8256854708469055e-05,
127
+ "loss": 0.4265,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.4854238019470888,
132
+ "grad_norm": 0.44763749837875366,
133
+ "learning_rate": 2.6147616536291464e-05,
134
+ "loss": 0.425,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.5123917909441493,
139
+ "grad_norm": 0.6412510275840759,
140
+ "learning_rate": 2.4030145379840563e-05,
141
+ "loss": 0.4263,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.5393597799412098,
146
+ "grad_norm": 0.884003758430481,
147
+ "learning_rate": 2.1919631946272402e-05,
148
+ "loss": 0.4225,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.5663277689382703,
153
+ "grad_norm": 0.4416724443435669,
154
+ "learning_rate": 1.9831217028140688e-05,
155
+ "loss": 0.4264,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.5932957579353307,
160
+ "grad_norm": 0.5343862175941467,
161
+ "learning_rate": 1.777988288357209e-05,
162
+ "loss": 0.4242,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.6202637469323913,
167
+ "grad_norm": 0.5542997717857361,
168
+ "learning_rate": 1.578034575376518e-05,
169
+ "loss": 0.4223,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.6472317359294517,
174
+ "grad_norm": 0.8904162049293518,
175
+ "learning_rate": 1.3850741762328944e-05,
176
+ "loss": 0.4209,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.6741997249265123,
181
+ "grad_norm": 0.4040282368659973,
182
+ "learning_rate": 1.1997184612520374e-05,
183
+ "loss": 0.4232,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.7011677139235727,
188
+ "grad_norm": 0.5565162897109985,
189
+ "learning_rate": 1.0236909470428333e-05,
190
+ "loss": 0.4229,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.7281357029206332,
195
+ "grad_norm": 0.6723429560661316,
196
+ "learning_rate": 8.585739531996178e-06,
197
+ "loss": 0.4187,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.7551036919176937,
202
+ "grad_norm": 0.35649460554122925,
203
+ "learning_rate": 7.048906317823642e-06,
204
+ "loss": 0.4244,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.7820716809147542,
209
+ "grad_norm": 0.3733614683151245,
210
+ "learning_rate": 5.640853987596667e-06,
211
+ "loss": 0.4178,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.8090396699118146,
216
+ "grad_norm": 0.889401912689209,
217
+ "learning_rate": 4.371683888171277e-06,
218
+ "loss": 0.4195,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.8360076589088752,
223
+ "grad_norm": 0.4235946536064148,
224
+ "learning_rate": 3.250501027307715e-06,
225
+ "loss": 0.4203,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.8629756479059356,
230
+ "grad_norm": 0.5250910520553589,
231
+ "learning_rate": 2.287118546736572e-06,
232
+ "loss": 0.4187,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.8899436369029962,
237
+ "grad_norm": 0.5213350057601929,
238
+ "learning_rate": 1.4845888005343062e-06,
239
+ "loss": 0.4202,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.9169116259000566,
244
+ "grad_norm": 0.4989272356033325,
245
+ "learning_rate": 8.507582708938533e-07,
246
+ "loss": 0.4221,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.9438796148971171,
251
+ "grad_norm": 0.4094185531139374,
252
+ "learning_rate": 3.901740487793598e-07,
253
+ "loss": 0.4217,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.9708476038941776,
258
+ "grad_norm": 0.5710214972496033,
259
+ "learning_rate": 1.0614035867460847e-07,
260
+ "loss": 0.4209,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.9978155928912381,
265
+ "grad_norm": 0.5618287920951843,
266
+ "learning_rate": 6.94854124816402e-10,
267
+ "loss": 0.4205,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.999973032011003,
272
+ "step": 18540,
273
+ "total_flos": 7.616663384064e+17,
274
+ "train_loss": 0.3924344686242755,
275
+ "train_runtime": 20071.1732,
276
+ "train_samples_per_second": 3.695,
277
+ "train_steps_per_second": 0.924
278
+ }
279
+ ],
280
+ "logging_steps": 500,
281
+ "max_steps": 18540,
282
+ "num_input_tokens_seen": 0,
283
+ "num_train_epochs": 1,
284
+ "save_steps": 500,
285
+ "stateful_callbacks": {
286
+ "TrainerControl": {
287
+ "args": {
288
+ "should_epoch_stop": false,
289
+ "should_evaluate": false,
290
+ "should_log": false,
291
+ "should_save": true,
292
+ "should_training_stop": true
293
+ },
294
+ "attributes": {}
295
+ }
296
+ },
297
+ "total_flos": 7.616663384064e+17,
298
+ "train_batch_size": 2,
299
+ "trial_name": null,
300
+ "trial_params": null
301
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f439b6f67a24a2bdf25a68b75e5fe1e8c471ef66c53f9032914a71e66a1621c
3
+ size 5560
training_loss.png ADDED