Phudish committed on
Commit
ba5a6d1
·
verified ·
1 Parent(s): 0d0fc32

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +122 -50
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1d6f42e69f174e188b7e547b5d177b72f981f7c3460bbe133b6aef2761f34bb
3
  size 990345064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e842f338dae7e1331dbacd985d4e787496d1560c060694c93eacbc48ed8c0ce4
3
  size 990345064
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7d1e3e73f0a72c95d8422666bcf5f268cfc4e40bc365f0ce445faebec7474a2
3
  size 1980859973
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e7600b9e111b7fd503118faff8164f6cb27427040a486d370178030cc7e6a26
3
  size 1980859973
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e31f6f5b302408a59340dd48ff29db720fa80e3be76edb3e9b7d412daa33838
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5746f5cd389dd09e26f845338f66196834e9c729ef345b5766421c338ecc3e71
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:624b66aa5eba448808744b6d328f3f238355960b2a710ef056623afebdb80957
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:802a36e47f9883645ae832d2ec1f3a606362c67d9cfd7c21739f0b88084f2906
3
  size 627
trainer_state.json CHANGED
@@ -1,121 +1,193 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 5949,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.25,
13
- "grad_norm": 0.6601719260215759,
14
- "learning_rate": 1.8991427130610186e-05,
15
- "loss": 2.9266,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.5,
20
- "grad_norm": 0.38926956057548523,
21
- "learning_rate": 1.7982854261220374e-05,
22
- "loss": 0.6793,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.76,
27
- "grad_norm": 0.44040098786354065,
28
- "learning_rate": 1.697428139183056e-05,
29
- "loss": 0.5527,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 1.0,
34
- "eval_loss": 0.40914833545684814,
35
- "eval_runtime": 1.2975,
36
- "eval_samples_per_second": 154.14,
37
- "eval_steps_per_second": 3.083,
38
  "step": 1983
39
  },
40
  {
41
  "epoch": 1.01,
42
- "grad_norm": 0.33104407787323,
43
- "learning_rate": 1.5965708522440747e-05,
44
- "loss": 0.4971,
45
  "step": 2000
46
  },
47
  {
48
  "epoch": 1.26,
49
- "grad_norm": 0.2552054226398468,
50
- "learning_rate": 1.4957135653050934e-05,
51
- "loss": 0.4723,
52
  "step": 2500
53
  },
54
  {
55
  "epoch": 1.51,
56
- "grad_norm": 0.25980475544929504,
57
- "learning_rate": 1.3948562783661122e-05,
58
- "loss": 0.4527,
59
  "step": 3000
60
  },
61
  {
62
  "epoch": 1.77,
63
- "grad_norm": 0.2463352233171463,
64
- "learning_rate": 1.2939989914271307e-05,
65
- "loss": 0.4385,
66
  "step": 3500
67
  },
68
  {
69
  "epoch": 2.0,
70
- "eval_loss": 0.367231547832489,
71
- "eval_runtime": 1.4969,
72
- "eval_samples_per_second": 133.61,
73
- "eval_steps_per_second": 2.672,
74
  "step": 3966
75
  },
76
  {
77
  "epoch": 2.02,
78
- "grad_norm": 0.21796859800815582,
79
- "learning_rate": 1.1931417044881495e-05,
80
- "loss": 0.4273,
81
  "step": 4000
82
  },
83
  {
84
  "epoch": 2.27,
85
- "grad_norm": 0.23424555361270905,
86
- "learning_rate": 1.0922844175491681e-05,
87
- "loss": 0.4175,
88
  "step": 4500
89
  },
90
  {
91
  "epoch": 2.52,
92
- "grad_norm": 0.1995495855808258,
93
- "learning_rate": 9.914271306101868e-06,
94
- "loss": 0.4082,
95
  "step": 5000
96
  },
97
  {
98
  "epoch": 2.77,
99
- "grad_norm": 0.2701597511768341,
100
- "learning_rate": 8.905698436712054e-06,
101
- "loss": 0.4057,
102
  "step": 5500
103
  },
104
  {
105
  "epoch": 3.0,
106
- "eval_loss": 0.349164217710495,
107
- "eval_runtime": 1.4817,
108
- "eval_samples_per_second": 134.978,
109
- "eval_steps_per_second": 2.7,
110
  "step": 5949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  }
112
  ],
113
  "logging_steps": 500,
114
- "max_steps": 9915,
115
  "num_input_tokens_seen": 0,
116
- "num_train_epochs": 5,
117
  "save_steps": 500,
118
- "total_flos": 2.3041896601603277e+17,
119
  "train_batch_size": 64,
120
  "trial_name": null,
121
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 9915,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.25,
13
+ "grad_norm": 1.1657905578613281,
14
+ "learning_rate": 1.9495713565305096e-05,
15
+ "loss": 2.9222,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.5,
20
+ "grad_norm": 0.34992507100105286,
21
+ "learning_rate": 1.8991427130610186e-05,
22
+ "loss": 0.6675,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.76,
27
+ "grad_norm": 0.4056757688522339,
28
+ "learning_rate": 1.8487140695915284e-05,
29
+ "loss": 0.5431,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 1.0,
34
+ "eval_loss": 0.40460753440856934,
35
+ "eval_runtime": 1.4605,
36
+ "eval_samples_per_second": 136.937,
37
+ "eval_steps_per_second": 2.739,
38
  "step": 1983
39
  },
40
  {
41
  "epoch": 1.01,
42
+ "grad_norm": 0.3280782103538513,
43
+ "learning_rate": 1.7982854261220374e-05,
44
+ "loss": 0.4894,
45
  "step": 2000
46
  },
47
  {
48
  "epoch": 1.26,
49
+ "grad_norm": 0.2726248800754547,
50
+ "learning_rate": 1.747856782652547e-05,
51
+ "loss": 0.4646,
52
  "step": 2500
53
  },
54
  {
55
  "epoch": 1.51,
56
+ "grad_norm": 0.24760539829730988,
57
+ "learning_rate": 1.697428139183056e-05,
58
+ "loss": 0.4445,
59
  "step": 3000
60
  },
61
  {
62
  "epoch": 1.77,
63
+ "grad_norm": 0.2567976415157318,
64
+ "learning_rate": 1.6469994957135657e-05,
65
+ "loss": 0.4297,
66
  "step": 3500
67
  },
68
  {
69
  "epoch": 2.0,
70
+ "eval_loss": 0.3594338595867157,
71
+ "eval_runtime": 1.4706,
72
+ "eval_samples_per_second": 135.998,
73
+ "eval_steps_per_second": 2.72,
74
  "step": 3966
75
  },
76
  {
77
  "epoch": 2.02,
78
+ "grad_norm": 0.21329322457313538,
79
+ "learning_rate": 1.5965708522440747e-05,
80
+ "loss": 0.4174,
81
  "step": 4000
82
  },
83
  {
84
  "epoch": 2.27,
85
+ "grad_norm": 0.22651061415672302,
86
+ "learning_rate": 1.546142208774584e-05,
87
+ "loss": 0.4065,
88
  "step": 4500
89
  },
90
  {
91
  "epoch": 2.52,
92
+ "grad_norm": 0.1898190826177597,
93
+ "learning_rate": 1.4957135653050934e-05,
94
+ "loss": 0.3963,
95
  "step": 5000
96
  },
97
  {
98
  "epoch": 2.77,
99
+ "grad_norm": 0.2700614631175995,
100
+ "learning_rate": 1.4452849218356026e-05,
101
+ "loss": 0.3927,
102
  "step": 5500
103
  },
104
  {
105
  "epoch": 3.0,
106
+ "eval_loss": 0.33957552909851074,
107
+ "eval_runtime": 1.4339,
108
+ "eval_samples_per_second": 139.484,
109
+ "eval_steps_per_second": 2.79,
110
  "step": 5949
111
+ },
112
+ {
113
+ "epoch": 3.03,
114
+ "grad_norm": 0.2275162637233734,
115
+ "learning_rate": 1.3948562783661122e-05,
116
+ "loss": 0.388,
117
+ "step": 6000
118
+ },
119
+ {
120
+ "epoch": 3.28,
121
+ "grad_norm": 0.2916988134384155,
122
+ "learning_rate": 1.3444276348966214e-05,
123
+ "loss": 0.3773,
124
+ "step": 6500
125
+ },
126
+ {
127
+ "epoch": 3.53,
128
+ "grad_norm": 0.25161975622177124,
129
+ "learning_rate": 1.2939989914271307e-05,
130
+ "loss": 0.3781,
131
+ "step": 7000
132
+ },
133
+ {
134
+ "epoch": 3.78,
135
+ "grad_norm": 0.3184010982513428,
136
+ "learning_rate": 1.2435703479576399e-05,
137
+ "loss": 0.3695,
138
+ "step": 7500
139
+ },
140
+ {
141
+ "epoch": 4.0,
142
+ "eval_loss": 0.3304264545440674,
143
+ "eval_runtime": 1.4505,
144
+ "eval_samples_per_second": 137.885,
145
+ "eval_steps_per_second": 2.758,
146
+ "step": 7932
147
+ },
148
+ {
149
+ "epoch": 4.03,
150
+ "grad_norm": 0.2558070123195648,
151
+ "learning_rate": 1.1931417044881495e-05,
152
+ "loss": 0.3671,
153
+ "step": 8000
154
+ },
155
+ {
156
+ "epoch": 4.29,
157
+ "grad_norm": 0.2560890018939972,
158
+ "learning_rate": 1.1427130610186587e-05,
159
+ "loss": 0.3573,
160
+ "step": 8500
161
+ },
162
+ {
163
+ "epoch": 4.54,
164
+ "grad_norm": 0.242618128657341,
165
+ "learning_rate": 1.0922844175491681e-05,
166
+ "loss": 0.3597,
167
+ "step": 9000
168
+ },
169
+ {
170
+ "epoch": 4.79,
171
+ "grad_norm": 0.20291948318481445,
172
+ "learning_rate": 1.0418557740796773e-05,
173
+ "loss": 0.3575,
174
+ "step": 9500
175
+ },
176
+ {
177
+ "epoch": 5.0,
178
+ "eval_loss": 0.32436466217041016,
179
+ "eval_runtime": 1.4557,
180
+ "eval_samples_per_second": 137.394,
181
+ "eval_steps_per_second": 2.748,
182
+ "step": 9915
183
  }
184
  ],
185
  "logging_steps": 500,
186
+ "max_steps": 19830,
187
  "num_input_tokens_seen": 0,
188
+ "num_train_epochs": 10,
189
  "save_steps": 500,
190
+ "total_flos": 3.839198019902669e+17,
191
  "train_batch_size": 64,
192
  "trial_name": null,
193
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbd57721092a12ad5c499cf07c4fb770ea0ece2ea47682a1c4c49c3ce1d5eba2
3
  size 4795
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c4524da51f7684ee67ccc8ad22c93ffcd26ae4356d3ffcda1d40bd60ec25d8f
3
  size 4795