Add files
Browse files- README.md +0 -0
- all_results.json +159 -0
- config.json +50 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +9 -0
- train_results.json +159 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
README.md
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
all_results.json
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"MSE": 0.0,
|
| 3 |
+
"MSE/layer0": 0.0,
|
| 4 |
+
"MSE/layer1": 0.0,
|
| 5 |
+
"MSE/layer10": 0.0,
|
| 6 |
+
"MSE/layer11": 0.0,
|
| 7 |
+
"MSE/layer12": 0.0,
|
| 8 |
+
"MSE/layer13": 0.0,
|
| 9 |
+
"MSE/layer14": 0.0,
|
| 10 |
+
"MSE/layer15": 0.0,
|
| 11 |
+
"MSE/layer16": 0.0,
|
| 12 |
+
"MSE/layer17": 0.0,
|
| 13 |
+
"MSE/layer18": 0.0,
|
| 14 |
+
"MSE/layer19": 0.0,
|
| 15 |
+
"MSE/layer2": 0.0,
|
| 16 |
+
"MSE/layer20": 0.0,
|
| 17 |
+
"MSE/layer21": 0.0,
|
| 18 |
+
"MSE/layer22": 0.0,
|
| 19 |
+
"MSE/layer23": 0.0,
|
| 20 |
+
"MSE/layer3": 0.0,
|
| 21 |
+
"MSE/layer4": 0.0,
|
| 22 |
+
"MSE/layer5": 0.0,
|
| 23 |
+
"MSE/layer6": 0.0,
|
| 24 |
+
"MSE/layer7": 0.0,
|
| 25 |
+
"MSE/layer8": 0.0,
|
| 26 |
+
"MSE/layer9": 0.0,
|
| 27 |
+
"dead_code_fraction": 1.0,
|
| 28 |
+
"dead_code_fraction/layer0": 1.0,
|
| 29 |
+
"dead_code_fraction/layer1": 1.0,
|
| 30 |
+
"dead_code_fraction/layer10": 1.0,
|
| 31 |
+
"dead_code_fraction/layer11": 1.0,
|
| 32 |
+
"dead_code_fraction/layer12": 1.0,
|
| 33 |
+
"dead_code_fraction/layer13": 1.0,
|
| 34 |
+
"dead_code_fraction/layer14": 1.0,
|
| 35 |
+
"dead_code_fraction/layer15": 1.0,
|
| 36 |
+
"dead_code_fraction/layer16": 1.0,
|
| 37 |
+
"dead_code_fraction/layer17": 1.0,
|
| 38 |
+
"dead_code_fraction/layer18": 1.0,
|
| 39 |
+
"dead_code_fraction/layer19": 1.0,
|
| 40 |
+
"dead_code_fraction/layer2": 1.0,
|
| 41 |
+
"dead_code_fraction/layer20": 1.0,
|
| 42 |
+
"dead_code_fraction/layer21": 1.0,
|
| 43 |
+
"dead_code_fraction/layer22": 1.0,
|
| 44 |
+
"dead_code_fraction/layer23": 1.0,
|
| 45 |
+
"dead_code_fraction/layer3": 1.0,
|
| 46 |
+
"dead_code_fraction/layer4": 1.0,
|
| 47 |
+
"dead_code_fraction/layer5": 1.0,
|
| 48 |
+
"dead_code_fraction/layer6": 1.0,
|
| 49 |
+
"dead_code_fraction/layer7": 1.0,
|
| 50 |
+
"dead_code_fraction/layer8": 1.0,
|
| 51 |
+
"dead_code_fraction/layer9": 1.0,
|
| 52 |
+
"epoch": 6.26,
|
| 53 |
+
"input_norm": 0.0,
|
| 54 |
+
"input_norm/layer0": 0.0,
|
| 55 |
+
"input_norm/layer1": 0.0,
|
| 56 |
+
"input_norm/layer10": 0.0,
|
| 57 |
+
"input_norm/layer11": 0.0,
|
| 58 |
+
"input_norm/layer12": 0.0,
|
| 59 |
+
"input_norm/layer13": 0.0,
|
| 60 |
+
"input_norm/layer14": 0.0,
|
| 61 |
+
"input_norm/layer15": 0.0,
|
| 62 |
+
"input_norm/layer16": 0.0,
|
| 63 |
+
"input_norm/layer17": 0.0,
|
| 64 |
+
"input_norm/layer18": 0.0,
|
| 65 |
+
"input_norm/layer19": 0.0,
|
| 66 |
+
"input_norm/layer2": 0.0,
|
| 67 |
+
"input_norm/layer20": 0.0,
|
| 68 |
+
"input_norm/layer21": 0.0,
|
| 69 |
+
"input_norm/layer22": 0.0,
|
| 70 |
+
"input_norm/layer23": 0.0,
|
| 71 |
+
"input_norm/layer3": 0.0,
|
| 72 |
+
"input_norm/layer4": 0.0,
|
| 73 |
+
"input_norm/layer5": 0.0,
|
| 74 |
+
"input_norm/layer6": 0.0,
|
| 75 |
+
"input_norm/layer7": 0.0,
|
| 76 |
+
"input_norm/layer8": 0.0,
|
| 77 |
+
"input_norm/layer9": 0.0,
|
| 78 |
+
"max_norm": 45.539119720458984,
|
| 79 |
+
"max_norm/layer0": 34.44173049926758,
|
| 80 |
+
"max_norm/layer1": 36.61558151245117,
|
| 81 |
+
"max_norm/layer10": 38.54380416870117,
|
| 82 |
+
"max_norm/layer11": 34.865203857421875,
|
| 83 |
+
"max_norm/layer12": 40.908504486083984,
|
| 84 |
+
"max_norm/layer13": 35.78108215332031,
|
| 85 |
+
"max_norm/layer14": 36.67228317260742,
|
| 86 |
+
"max_norm/layer15": 45.083438873291016,
|
| 87 |
+
"max_norm/layer16": 36.927913665771484,
|
| 88 |
+
"max_norm/layer17": 45.539119720458984,
|
| 89 |
+
"max_norm/layer18": 39.2352409362793,
|
| 90 |
+
"max_norm/layer19": 38.779598236083984,
|
| 91 |
+
"max_norm/layer2": 26.836795806884766,
|
| 92 |
+
"max_norm/layer20": 38.50577163696289,
|
| 93 |
+
"max_norm/layer21": 38.87571334838867,
|
| 94 |
+
"max_norm/layer22": 39.42427062988281,
|
| 95 |
+
"max_norm/layer23": 37.21847915649414,
|
| 96 |
+
"max_norm/layer3": 34.34575271606445,
|
| 97 |
+
"max_norm/layer4": 34.4432258605957,
|
| 98 |
+
"max_norm/layer5": 44.077754974365234,
|
| 99 |
+
"max_norm/layer6": 28.6057071685791,
|
| 100 |
+
"max_norm/layer7": 37.91745376586914,
|
| 101 |
+
"max_norm/layer8": 36.69032287597656,
|
| 102 |
+
"max_norm/layer9": 37.08796691894531,
|
| 103 |
+
"mean_norm": 11.799732064207396,
|
| 104 |
+
"mean_norm/layer0": 11.755437850952148,
|
| 105 |
+
"mean_norm/layer1": 11.22901839017868,
|
| 106 |
+
"mean_norm/layer10": 11.532833635807037,
|
| 107 |
+
"mean_norm/layer11": 11.962444841861725,
|
| 108 |
+
"mean_norm/layer12": 12.79077160358429,
|
| 109 |
+
"mean_norm/layer13": 11.57960969209671,
|
| 110 |
+
"mean_norm/layer14": 12.059264957904816,
|
| 111 |
+
"mean_norm/layer15": 12.540440499782562,
|
| 112 |
+
"mean_norm/layer16": 11.641206741333008,
|
| 113 |
+
"mean_norm/layer17": 12.231300234794617,
|
| 114 |
+
"mean_norm/layer18": 11.600049555301666,
|
| 115 |
+
"mean_norm/layer19": 11.686796128749847,
|
| 116 |
+
"mean_norm/layer2": 9.256644666194916,
|
| 117 |
+
"mean_norm/layer20": 11.78922188282013,
|
| 118 |
+
"mean_norm/layer21": 11.759462356567383,
|
| 119 |
+
"mean_norm/layer22": 13.063357532024384,
|
| 120 |
+
"mean_norm/layer23": 13.022553265094757,
|
| 121 |
+
"mean_norm/layer3": 12.574194192886353,
|
| 122 |
+
"mean_norm/layer4": 10.863756775856018,
|
| 123 |
+
"mean_norm/layer5": 14.197384178638458,
|
| 124 |
+
"mean_norm/layer6": 10.185243308544159,
|
| 125 |
+
"mean_norm/layer7": 10.893572747707367,
|
| 126 |
+
"mean_norm/layer8": 11.53871750831604,
|
| 127 |
+
"mean_norm/layer9": 11.440286993980408,
|
| 128 |
+
"multicode_k": 8,
|
| 129 |
+
"output_norm": 0.0,
|
| 130 |
+
"output_norm/layer0": 0.0,
|
| 131 |
+
"output_norm/layer1": 0.0,
|
| 132 |
+
"output_norm/layer10": 0.0,
|
| 133 |
+
"output_norm/layer11": 0.0,
|
| 134 |
+
"output_norm/layer12": 0.0,
|
| 135 |
+
"output_norm/layer13": 0.0,
|
| 136 |
+
"output_norm/layer14": 0.0,
|
| 137 |
+
"output_norm/layer15": 0.0,
|
| 138 |
+
"output_norm/layer16": 0.0,
|
| 139 |
+
"output_norm/layer17": 0.0,
|
| 140 |
+
"output_norm/layer18": 0.0,
|
| 141 |
+
"output_norm/layer19": 0.0,
|
| 142 |
+
"output_norm/layer2": 0.0,
|
| 143 |
+
"output_norm/layer20": 0.0,
|
| 144 |
+
"output_norm/layer21": 0.0,
|
| 145 |
+
"output_norm/layer22": 0.0,
|
| 146 |
+
"output_norm/layer23": 0.0,
|
| 147 |
+
"output_norm/layer3": 0.0,
|
| 148 |
+
"output_norm/layer4": 0.0,
|
| 149 |
+
"output_norm/layer5": 0.0,
|
| 150 |
+
"output_norm/layer6": 0.0,
|
| 151 |
+
"output_norm/layer7": 0.0,
|
| 152 |
+
"output_norm/layer8": 0.0,
|
| 153 |
+
"output_norm/layer9": 0.0,
|
| 154 |
+
"train_loss": 2.685329116312663,
|
| 155 |
+
"train_runtime": 43939.9354,
|
| 156 |
+
"train_samples": 114937,
|
| 157 |
+
"train_samples_per_second": 16.386,
|
| 158 |
+
"train_steps_per_second": 0.341
|
| 159 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GPTNeoXCodebookModel"
|
| 4 |
+
],
|
| 5 |
+
"codebook_at": [
|
| 6 |
+
"attn_preproj"
|
| 7 |
+
],
|
| 8 |
+
"codebook_type": "group",
|
| 9 |
+
"k_codebook": 8,
|
| 10 |
+
"kmeans_init": false,
|
| 11 |
+
"kmeans_init_examples": 1000,
|
| 12 |
+
"kmeans_kwargs": {
|
| 13 |
+
"batch_size": 24576,
|
| 14 |
+
"n_init": "auto"
|
| 15 |
+
},
|
| 16 |
+
"kmeans_path": "/.cache/cb_volume/huggingface/kmeans_embeddings.pt",
|
| 17 |
+
"layers_to_snap": [
|
| 18 |
+
0,
|
| 19 |
+
1,
|
| 20 |
+
2,
|
| 21 |
+
3,
|
| 22 |
+
4,
|
| 23 |
+
5,
|
| 24 |
+
6,
|
| 25 |
+
7,
|
| 26 |
+
8,
|
| 27 |
+
9,
|
| 28 |
+
10,
|
| 29 |
+
11,
|
| 30 |
+
12,
|
| 31 |
+
13,
|
| 32 |
+
14,
|
| 33 |
+
15,
|
| 34 |
+
16,
|
| 35 |
+
17,
|
| 36 |
+
18,
|
| 37 |
+
19,
|
| 38 |
+
20,
|
| 39 |
+
21,
|
| 40 |
+
22,
|
| 41 |
+
23
|
| 42 |
+
],
|
| 43 |
+
"loss": "aeloss",
|
| 44 |
+
"model_type": "codebook",
|
| 45 |
+
"num_codebooks": 16,
|
| 46 |
+
"num_codes": 10000,
|
| 47 |
+
"similarity_metric": "inner_product",
|
| 48 |
+
"torch_dtype": "float32",
|
| 49 |
+
"transformers_version": "4.27.3"
|
| 50 |
+
}
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:047349fb750aba188ba5b471a0c8e518b74984623d17524d02475169d29d4a3d
|
| 3 |
+
size 2705783745
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"unk_token": "<|endoftext|>"
|
| 5 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"bos_token": "<|endoftext|>",
|
| 4 |
+
"eos_token": "<|endoftext|>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
|
| 7 |
+
"tokenizer_class": "GPTNeoXTokenizer",
|
| 8 |
+
"unk_token": "<|endoftext|>"
|
| 9 |
+
}
|
train_results.json
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"MSE": 0.0,
|
| 3 |
+
"MSE/layer0": 0.0,
|
| 4 |
+
"MSE/layer1": 0.0,
|
| 5 |
+
"MSE/layer10": 0.0,
|
| 6 |
+
"MSE/layer11": 0.0,
|
| 7 |
+
"MSE/layer12": 0.0,
|
| 8 |
+
"MSE/layer13": 0.0,
|
| 9 |
+
"MSE/layer14": 0.0,
|
| 10 |
+
"MSE/layer15": 0.0,
|
| 11 |
+
"MSE/layer16": 0.0,
|
| 12 |
+
"MSE/layer17": 0.0,
|
| 13 |
+
"MSE/layer18": 0.0,
|
| 14 |
+
"MSE/layer19": 0.0,
|
| 15 |
+
"MSE/layer2": 0.0,
|
| 16 |
+
"MSE/layer20": 0.0,
|
| 17 |
+
"MSE/layer21": 0.0,
|
| 18 |
+
"MSE/layer22": 0.0,
|
| 19 |
+
"MSE/layer23": 0.0,
|
| 20 |
+
"MSE/layer3": 0.0,
|
| 21 |
+
"MSE/layer4": 0.0,
|
| 22 |
+
"MSE/layer5": 0.0,
|
| 23 |
+
"MSE/layer6": 0.0,
|
| 24 |
+
"MSE/layer7": 0.0,
|
| 25 |
+
"MSE/layer8": 0.0,
|
| 26 |
+
"MSE/layer9": 0.0,
|
| 27 |
+
"dead_code_fraction": 1.0,
|
| 28 |
+
"dead_code_fraction/layer0": 1.0,
|
| 29 |
+
"dead_code_fraction/layer1": 1.0,
|
| 30 |
+
"dead_code_fraction/layer10": 1.0,
|
| 31 |
+
"dead_code_fraction/layer11": 1.0,
|
| 32 |
+
"dead_code_fraction/layer12": 1.0,
|
| 33 |
+
"dead_code_fraction/layer13": 1.0,
|
| 34 |
+
"dead_code_fraction/layer14": 1.0,
|
| 35 |
+
"dead_code_fraction/layer15": 1.0,
|
| 36 |
+
"dead_code_fraction/layer16": 1.0,
|
| 37 |
+
"dead_code_fraction/layer17": 1.0,
|
| 38 |
+
"dead_code_fraction/layer18": 1.0,
|
| 39 |
+
"dead_code_fraction/layer19": 1.0,
|
| 40 |
+
"dead_code_fraction/layer2": 1.0,
|
| 41 |
+
"dead_code_fraction/layer20": 1.0,
|
| 42 |
+
"dead_code_fraction/layer21": 1.0,
|
| 43 |
+
"dead_code_fraction/layer22": 1.0,
|
| 44 |
+
"dead_code_fraction/layer23": 1.0,
|
| 45 |
+
"dead_code_fraction/layer3": 1.0,
|
| 46 |
+
"dead_code_fraction/layer4": 1.0,
|
| 47 |
+
"dead_code_fraction/layer5": 1.0,
|
| 48 |
+
"dead_code_fraction/layer6": 1.0,
|
| 49 |
+
"dead_code_fraction/layer7": 1.0,
|
| 50 |
+
"dead_code_fraction/layer8": 1.0,
|
| 51 |
+
"dead_code_fraction/layer9": 1.0,
|
| 52 |
+
"epoch": 6.26,
|
| 53 |
+
"input_norm": 0.0,
|
| 54 |
+
"input_norm/layer0": 0.0,
|
| 55 |
+
"input_norm/layer1": 0.0,
|
| 56 |
+
"input_norm/layer10": 0.0,
|
| 57 |
+
"input_norm/layer11": 0.0,
|
| 58 |
+
"input_norm/layer12": 0.0,
|
| 59 |
+
"input_norm/layer13": 0.0,
|
| 60 |
+
"input_norm/layer14": 0.0,
|
| 61 |
+
"input_norm/layer15": 0.0,
|
| 62 |
+
"input_norm/layer16": 0.0,
|
| 63 |
+
"input_norm/layer17": 0.0,
|
| 64 |
+
"input_norm/layer18": 0.0,
|
| 65 |
+
"input_norm/layer19": 0.0,
|
| 66 |
+
"input_norm/layer2": 0.0,
|
| 67 |
+
"input_norm/layer20": 0.0,
|
| 68 |
+
"input_norm/layer21": 0.0,
|
| 69 |
+
"input_norm/layer22": 0.0,
|
| 70 |
+
"input_norm/layer23": 0.0,
|
| 71 |
+
"input_norm/layer3": 0.0,
|
| 72 |
+
"input_norm/layer4": 0.0,
|
| 73 |
+
"input_norm/layer5": 0.0,
|
| 74 |
+
"input_norm/layer6": 0.0,
|
| 75 |
+
"input_norm/layer7": 0.0,
|
| 76 |
+
"input_norm/layer8": 0.0,
|
| 77 |
+
"input_norm/layer9": 0.0,
|
| 78 |
+
"max_norm": 45.539119720458984,
|
| 79 |
+
"max_norm/layer0": 34.44173049926758,
|
| 80 |
+
"max_norm/layer1": 36.61558151245117,
|
| 81 |
+
"max_norm/layer10": 38.54380416870117,
|
| 82 |
+
"max_norm/layer11": 34.865203857421875,
|
| 83 |
+
"max_norm/layer12": 40.908504486083984,
|
| 84 |
+
"max_norm/layer13": 35.78108215332031,
|
| 85 |
+
"max_norm/layer14": 36.67228317260742,
|
| 86 |
+
"max_norm/layer15": 45.083438873291016,
|
| 87 |
+
"max_norm/layer16": 36.927913665771484,
|
| 88 |
+
"max_norm/layer17": 45.539119720458984,
|
| 89 |
+
"max_norm/layer18": 39.2352409362793,
|
| 90 |
+
"max_norm/layer19": 38.779598236083984,
|
| 91 |
+
"max_norm/layer2": 26.836795806884766,
|
| 92 |
+
"max_norm/layer20": 38.50577163696289,
|
| 93 |
+
"max_norm/layer21": 38.87571334838867,
|
| 94 |
+
"max_norm/layer22": 39.42427062988281,
|
| 95 |
+
"max_norm/layer23": 37.21847915649414,
|
| 96 |
+
"max_norm/layer3": 34.34575271606445,
|
| 97 |
+
"max_norm/layer4": 34.4432258605957,
|
| 98 |
+
"max_norm/layer5": 44.077754974365234,
|
| 99 |
+
"max_norm/layer6": 28.6057071685791,
|
| 100 |
+
"max_norm/layer7": 37.91745376586914,
|
| 101 |
+
"max_norm/layer8": 36.69032287597656,
|
| 102 |
+
"max_norm/layer9": 37.08796691894531,
|
| 103 |
+
"mean_norm": 11.799732064207396,
|
| 104 |
+
"mean_norm/layer0": 11.755437850952148,
|
| 105 |
+
"mean_norm/layer1": 11.22901839017868,
|
| 106 |
+
"mean_norm/layer10": 11.532833635807037,
|
| 107 |
+
"mean_norm/layer11": 11.962444841861725,
|
| 108 |
+
"mean_norm/layer12": 12.79077160358429,
|
| 109 |
+
"mean_norm/layer13": 11.57960969209671,
|
| 110 |
+
"mean_norm/layer14": 12.059264957904816,
|
| 111 |
+
"mean_norm/layer15": 12.540440499782562,
|
| 112 |
+
"mean_norm/layer16": 11.641206741333008,
|
| 113 |
+
"mean_norm/layer17": 12.231300234794617,
|
| 114 |
+
"mean_norm/layer18": 11.600049555301666,
|
| 115 |
+
"mean_norm/layer19": 11.686796128749847,
|
| 116 |
+
"mean_norm/layer2": 9.256644666194916,
|
| 117 |
+
"mean_norm/layer20": 11.78922188282013,
|
| 118 |
+
"mean_norm/layer21": 11.759462356567383,
|
| 119 |
+
"mean_norm/layer22": 13.063357532024384,
|
| 120 |
+
"mean_norm/layer23": 13.022553265094757,
|
| 121 |
+
"mean_norm/layer3": 12.574194192886353,
|
| 122 |
+
"mean_norm/layer4": 10.863756775856018,
|
| 123 |
+
"mean_norm/layer5": 14.197384178638458,
|
| 124 |
+
"mean_norm/layer6": 10.185243308544159,
|
| 125 |
+
"mean_norm/layer7": 10.893572747707367,
|
| 126 |
+
"mean_norm/layer8": 11.53871750831604,
|
| 127 |
+
"mean_norm/layer9": 11.440286993980408,
|
| 128 |
+
"multicode_k": 8,
|
| 129 |
+
"output_norm": 0.0,
|
| 130 |
+
"output_norm/layer0": 0.0,
|
| 131 |
+
"output_norm/layer1": 0.0,
|
| 132 |
+
"output_norm/layer10": 0.0,
|
| 133 |
+
"output_norm/layer11": 0.0,
|
| 134 |
+
"output_norm/layer12": 0.0,
|
| 135 |
+
"output_norm/layer13": 0.0,
|
| 136 |
+
"output_norm/layer14": 0.0,
|
| 137 |
+
"output_norm/layer15": 0.0,
|
| 138 |
+
"output_norm/layer16": 0.0,
|
| 139 |
+
"output_norm/layer17": 0.0,
|
| 140 |
+
"output_norm/layer18": 0.0,
|
| 141 |
+
"output_norm/layer19": 0.0,
|
| 142 |
+
"output_norm/layer2": 0.0,
|
| 143 |
+
"output_norm/layer20": 0.0,
|
| 144 |
+
"output_norm/layer21": 0.0,
|
| 145 |
+
"output_norm/layer22": 0.0,
|
| 146 |
+
"output_norm/layer23": 0.0,
|
| 147 |
+
"output_norm/layer3": 0.0,
|
| 148 |
+
"output_norm/layer4": 0.0,
|
| 149 |
+
"output_norm/layer5": 0.0,
|
| 150 |
+
"output_norm/layer6": 0.0,
|
| 151 |
+
"output_norm/layer7": 0.0,
|
| 152 |
+
"output_norm/layer8": 0.0,
|
| 153 |
+
"output_norm/layer9": 0.0,
|
| 154 |
+
"train_loss": 2.685329116312663,
|
| 155 |
+
"train_runtime": 43939.9354,
|
| 156 |
+
"train_samples": 114937,
|
| 157 |
+
"train_samples_per_second": 16.386,
|
| 158 |
+
"train_steps_per_second": 0.341
|
| 159 |
+
}
|
trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:974e67b66201b847274c72f8bccd37bc28a91bd779b977a46504b96111e57b61
|
| 3 |
+
size 3771
|