Add files

Browse files

Files changed (10) hide show

README.md +0 -0
all_results.json +159 -0
config.json +50 -0
pytorch_model.bin +3 -0
special_tokens_map.json +5 -0
tokenizer.json +0 -0
tokenizer_config.json +9 -0
train_results.json +159 -0
trainer_state.json +0 -0
training_args.bin +3 -0

README.md CHANGED Viewed

The diff for this file is too large to render. See raw diff

all_results.json ADDED Viewed

	@@ -0,0 +1,159 @@

+{
+    "MSE": 0.0,
+    "MSE/layer0": 0.0,
+    "MSE/layer1": 0.0,
+    "MSE/layer10": 0.0,
+    "MSE/layer11": 0.0,
+    "MSE/layer12": 0.0,
+    "MSE/layer13": 0.0,
+    "MSE/layer14": 0.0,
+    "MSE/layer15": 0.0,
+    "MSE/layer16": 0.0,
+    "MSE/layer17": 0.0,
+    "MSE/layer18": 0.0,
+    "MSE/layer19": 0.0,
+    "MSE/layer2": 0.0,
+    "MSE/layer20": 0.0,
+    "MSE/layer21": 0.0,
+    "MSE/layer22": 0.0,
+    "MSE/layer23": 0.0,
+    "MSE/layer3": 0.0,
+    "MSE/layer4": 0.0,
+    "MSE/layer5": 0.0,
+    "MSE/layer6": 0.0,
+    "MSE/layer7": 0.0,
+    "MSE/layer8": 0.0,
+    "MSE/layer9": 0.0,
+    "dead_code_fraction": 1.0,
+    "dead_code_fraction/layer0": 1.0,
+    "dead_code_fraction/layer1": 1.0,
+    "dead_code_fraction/layer10": 1.0,
+    "dead_code_fraction/layer11": 1.0,
+    "dead_code_fraction/layer12": 1.0,
+    "dead_code_fraction/layer13": 1.0,
+    "dead_code_fraction/layer14": 1.0,
+    "dead_code_fraction/layer15": 1.0,
+    "dead_code_fraction/layer16": 1.0,
+    "dead_code_fraction/layer17": 1.0,
+    "dead_code_fraction/layer18": 1.0,
+    "dead_code_fraction/layer19": 1.0,
+    "dead_code_fraction/layer2": 1.0,
+    "dead_code_fraction/layer20": 1.0,
+    "dead_code_fraction/layer21": 1.0,
+    "dead_code_fraction/layer22": 1.0,
+    "dead_code_fraction/layer23": 1.0,
+    "dead_code_fraction/layer3": 1.0,
+    "dead_code_fraction/layer4": 1.0,
+    "dead_code_fraction/layer5": 1.0,
+    "dead_code_fraction/layer6": 1.0,
+    "dead_code_fraction/layer7": 1.0,
+    "dead_code_fraction/layer8": 1.0,
+    "dead_code_fraction/layer9": 1.0,
+    "epoch": 6.26,
+    "input_norm": 0.0,
+    "input_norm/layer0": 0.0,
+    "input_norm/layer1": 0.0,
+    "input_norm/layer10": 0.0,
+    "input_norm/layer11": 0.0,
+    "input_norm/layer12": 0.0,
+    "input_norm/layer13": 0.0,
+    "input_norm/layer14": 0.0,
+    "input_norm/layer15": 0.0,
+    "input_norm/layer16": 0.0,
+    "input_norm/layer17": 0.0,
+    "input_norm/layer18": 0.0,
+    "input_norm/layer19": 0.0,
+    "input_norm/layer2": 0.0,
+    "input_norm/layer20": 0.0,
+    "input_norm/layer21": 0.0,
+    "input_norm/layer22": 0.0,
+    "input_norm/layer23": 0.0,
+    "input_norm/layer3": 0.0,
+    "input_norm/layer4": 0.0,
+    "input_norm/layer5": 0.0,
+    "input_norm/layer6": 0.0,
+    "input_norm/layer7": 0.0,
+    "input_norm/layer8": 0.0,
+    "input_norm/layer9": 0.0,
+    "max_norm": 45.539119720458984,
+    "max_norm/layer0": 34.44173049926758,
+    "max_norm/layer1": 36.61558151245117,
+    "max_norm/layer10": 38.54380416870117,
+    "max_norm/layer11": 34.865203857421875,
+    "max_norm/layer12": 40.908504486083984,
+    "max_norm/layer13": 35.78108215332031,
+    "max_norm/layer14": 36.67228317260742,
+    "max_norm/layer15": 45.083438873291016,
+    "max_norm/layer16": 36.927913665771484,
+    "max_norm/layer17": 45.539119720458984,
+    "max_norm/layer18": 39.2352409362793,
+    "max_norm/layer19": 38.779598236083984,
+    "max_norm/layer2": 26.836795806884766,
+    "max_norm/layer20": 38.50577163696289,
+    "max_norm/layer21": 38.87571334838867,
+    "max_norm/layer22": 39.42427062988281,
+    "max_norm/layer23": 37.21847915649414,
+    "max_norm/layer3": 34.34575271606445,
+    "max_norm/layer4": 34.4432258605957,
+    "max_norm/layer5": 44.077754974365234,
+    "max_norm/layer6": 28.6057071685791,
+    "max_norm/layer7": 37.91745376586914,
+    "max_norm/layer8": 36.69032287597656,
+    "max_norm/layer9": 37.08796691894531,
+    "mean_norm": 11.799732064207396,
+    "mean_norm/layer0": 11.755437850952148,
+    "mean_norm/layer1": 11.22901839017868,
+    "mean_norm/layer10": 11.532833635807037,
+    "mean_norm/layer11": 11.962444841861725,
+    "mean_norm/layer12": 12.79077160358429,
+    "mean_norm/layer13": 11.57960969209671,
+    "mean_norm/layer14": 12.059264957904816,
+    "mean_norm/layer15": 12.540440499782562,
+    "mean_norm/layer16": 11.641206741333008,
+    "mean_norm/layer17": 12.231300234794617,
+    "mean_norm/layer18": 11.600049555301666,
+    "mean_norm/layer19": 11.686796128749847,
+    "mean_norm/layer2": 9.256644666194916,
+    "mean_norm/layer20": 11.78922188282013,
+    "mean_norm/layer21": 11.759462356567383,
+    "mean_norm/layer22": 13.063357532024384,
+    "mean_norm/layer23": 13.022553265094757,
+    "mean_norm/layer3": 12.574194192886353,
+    "mean_norm/layer4": 10.863756775856018,
+    "mean_norm/layer5": 14.197384178638458,
+    "mean_norm/layer6": 10.185243308544159,
+    "mean_norm/layer7": 10.893572747707367,
+    "mean_norm/layer8": 11.53871750831604,
+    "mean_norm/layer9": 11.440286993980408,
+    "multicode_k": 8,
+    "output_norm": 0.0,
+    "output_norm/layer0": 0.0,
+    "output_norm/layer1": 0.0,
+    "output_norm/layer10": 0.0,
+    "output_norm/layer11": 0.0,
+    "output_norm/layer12": 0.0,
+    "output_norm/layer13": 0.0,
+    "output_norm/layer14": 0.0,
+    "output_norm/layer15": 0.0,
+    "output_norm/layer16": 0.0,
+    "output_norm/layer17": 0.0,
+    "output_norm/layer18": 0.0,
+    "output_norm/layer19": 0.0,
+    "output_norm/layer2": 0.0,
+    "output_norm/layer20": 0.0,
+    "output_norm/layer21": 0.0,
+    "output_norm/layer22": 0.0,
+    "output_norm/layer23": 0.0,
+    "output_norm/layer3": 0.0,
+    "output_norm/layer4": 0.0,
+    "output_norm/layer5": 0.0,
+    "output_norm/layer6": 0.0,
+    "output_norm/layer7": 0.0,
+    "output_norm/layer8": 0.0,
+    "output_norm/layer9": 0.0,
+    "train_loss": 2.685329116312663,
+    "train_runtime": 43939.9354,
+    "train_samples": 114937,
+    "train_samples_per_second": 16.386,
+    "train_steps_per_second": 0.341
+}

config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "architectures": [
+    "GPTNeoXCodebookModel"
+  ],
+  "codebook_at": [
+    "attn_preproj"
+  ],
+  "codebook_type": "group",
+  "k_codebook": 8,
+  "kmeans_init": false,
+  "kmeans_init_examples": 1000,
+  "kmeans_kwargs": {
+    "batch_size": 24576,
+    "n_init": "auto"
+  },
+  "kmeans_path": "/.cache/cb_volume/huggingface/kmeans_embeddings.pt",
+  "layers_to_snap": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23
+  ],
+  "loss": "aeloss",
+  "model_type": "codebook",
+  "num_codebooks": 16,
+  "num_codes": 10000,
+  "similarity_metric": "inner_product",
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.3"
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:047349fb750aba188ba5b471a0c8e518b74984623d17524d02475169d29d4a3d
+size 2705783745

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "add_prefix_space": false,
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": "<|endoftext|>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,159 @@

+{
+    "MSE": 0.0,
+    "MSE/layer0": 0.0,
+    "MSE/layer1": 0.0,
+    "MSE/layer10": 0.0,
+    "MSE/layer11": 0.0,
+    "MSE/layer12": 0.0,
+    "MSE/layer13": 0.0,
+    "MSE/layer14": 0.0,
+    "MSE/layer15": 0.0,
+    "MSE/layer16": 0.0,
+    "MSE/layer17": 0.0,
+    "MSE/layer18": 0.0,
+    "MSE/layer19": 0.0,
+    "MSE/layer2": 0.0,
+    "MSE/layer20": 0.0,
+    "MSE/layer21": 0.0,
+    "MSE/layer22": 0.0,
+    "MSE/layer23": 0.0,
+    "MSE/layer3": 0.0,
+    "MSE/layer4": 0.0,
+    "MSE/layer5": 0.0,
+    "MSE/layer6": 0.0,
+    "MSE/layer7": 0.0,
+    "MSE/layer8": 0.0,
+    "MSE/layer9": 0.0,
+    "dead_code_fraction": 1.0,
+    "dead_code_fraction/layer0": 1.0,
+    "dead_code_fraction/layer1": 1.0,
+    "dead_code_fraction/layer10": 1.0,
+    "dead_code_fraction/layer11": 1.0,
+    "dead_code_fraction/layer12": 1.0,
+    "dead_code_fraction/layer13": 1.0,
+    "dead_code_fraction/layer14": 1.0,
+    "dead_code_fraction/layer15": 1.0,
+    "dead_code_fraction/layer16": 1.0,
+    "dead_code_fraction/layer17": 1.0,
+    "dead_code_fraction/layer18": 1.0,
+    "dead_code_fraction/layer19": 1.0,
+    "dead_code_fraction/layer2": 1.0,
+    "dead_code_fraction/layer20": 1.0,
+    "dead_code_fraction/layer21": 1.0,
+    "dead_code_fraction/layer22": 1.0,
+    "dead_code_fraction/layer23": 1.0,
+    "dead_code_fraction/layer3": 1.0,
+    "dead_code_fraction/layer4": 1.0,
+    "dead_code_fraction/layer5": 1.0,
+    "dead_code_fraction/layer6": 1.0,
+    "dead_code_fraction/layer7": 1.0,
+    "dead_code_fraction/layer8": 1.0,
+    "dead_code_fraction/layer9": 1.0,
+    "epoch": 6.26,
+    "input_norm": 0.0,
+    "input_norm/layer0": 0.0,
+    "input_norm/layer1": 0.0,
+    "input_norm/layer10": 0.0,
+    "input_norm/layer11": 0.0,
+    "input_norm/layer12": 0.0,
+    "input_norm/layer13": 0.0,
+    "input_norm/layer14": 0.0,
+    "input_norm/layer15": 0.0,
+    "input_norm/layer16": 0.0,
+    "input_norm/layer17": 0.0,
+    "input_norm/layer18": 0.0,
+    "input_norm/layer19": 0.0,
+    "input_norm/layer2": 0.0,
+    "input_norm/layer20": 0.0,
+    "input_norm/layer21": 0.0,
+    "input_norm/layer22": 0.0,
+    "input_norm/layer23": 0.0,
+    "input_norm/layer3": 0.0,
+    "input_norm/layer4": 0.0,
+    "input_norm/layer5": 0.0,
+    "input_norm/layer6": 0.0,
+    "input_norm/layer7": 0.0,
+    "input_norm/layer8": 0.0,
+    "input_norm/layer9": 0.0,
+    "max_norm": 45.539119720458984,
+    "max_norm/layer0": 34.44173049926758,
+    "max_norm/layer1": 36.61558151245117,
+    "max_norm/layer10": 38.54380416870117,
+    "max_norm/layer11": 34.865203857421875,
+    "max_norm/layer12": 40.908504486083984,
+    "max_norm/layer13": 35.78108215332031,
+    "max_norm/layer14": 36.67228317260742,
+    "max_norm/layer15": 45.083438873291016,
+    "max_norm/layer16": 36.927913665771484,
+    "max_norm/layer17": 45.539119720458984,
+    "max_norm/layer18": 39.2352409362793,
+    "max_norm/layer19": 38.779598236083984,
+    "max_norm/layer2": 26.836795806884766,
+    "max_norm/layer20": 38.50577163696289,
+    "max_norm/layer21": 38.87571334838867,
+    "max_norm/layer22": 39.42427062988281,
+    "max_norm/layer23": 37.21847915649414,
+    "max_norm/layer3": 34.34575271606445,
+    "max_norm/layer4": 34.4432258605957,
+    "max_norm/layer5": 44.077754974365234,
+    "max_norm/layer6": 28.6057071685791,
+    "max_norm/layer7": 37.91745376586914,
+    "max_norm/layer8": 36.69032287597656,
+    "max_norm/layer9": 37.08796691894531,
+    "mean_norm": 11.799732064207396,
+    "mean_norm/layer0": 11.755437850952148,
+    "mean_norm/layer1": 11.22901839017868,
+    "mean_norm/layer10": 11.532833635807037,
+    "mean_norm/layer11": 11.962444841861725,
+    "mean_norm/layer12": 12.79077160358429,
+    "mean_norm/layer13": 11.57960969209671,
+    "mean_norm/layer14": 12.059264957904816,
+    "mean_norm/layer15": 12.540440499782562,
+    "mean_norm/layer16": 11.641206741333008,
+    "mean_norm/layer17": 12.231300234794617,
+    "mean_norm/layer18": 11.600049555301666,
+    "mean_norm/layer19": 11.686796128749847,
+    "mean_norm/layer2": 9.256644666194916,
+    "mean_norm/layer20": 11.78922188282013,
+    "mean_norm/layer21": 11.759462356567383,
+    "mean_norm/layer22": 13.063357532024384,
+    "mean_norm/layer23": 13.022553265094757,
+    "mean_norm/layer3": 12.574194192886353,
+    "mean_norm/layer4": 10.863756775856018,
+    "mean_norm/layer5": 14.197384178638458,
+    "mean_norm/layer6": 10.185243308544159,
+    "mean_norm/layer7": 10.893572747707367,
+    "mean_norm/layer8": 11.53871750831604,
+    "mean_norm/layer9": 11.440286993980408,
+    "multicode_k": 8,
+    "output_norm": 0.0,
+    "output_norm/layer0": 0.0,
+    "output_norm/layer1": 0.0,
+    "output_norm/layer10": 0.0,
+    "output_norm/layer11": 0.0,
+    "output_norm/layer12": 0.0,
+    "output_norm/layer13": 0.0,
+    "output_norm/layer14": 0.0,
+    "output_norm/layer15": 0.0,
+    "output_norm/layer16": 0.0,
+    "output_norm/layer17": 0.0,
+    "output_norm/layer18": 0.0,
+    "output_norm/layer19": 0.0,
+    "output_norm/layer2": 0.0,
+    "output_norm/layer20": 0.0,
+    "output_norm/layer21": 0.0,
+    "output_norm/layer22": 0.0,
+    "output_norm/layer23": 0.0,
+    "output_norm/layer3": 0.0,
+    "output_norm/layer4": 0.0,
+    "output_norm/layer5": 0.0,
+    "output_norm/layer6": 0.0,
+    "output_norm/layer7": 0.0,
+    "output_norm/layer8": 0.0,
+    "output_norm/layer9": 0.0,
+    "train_loss": 2.685329116312663,
+    "train_runtime": 43939.9354,
+    "train_samples": 114937,
+    "train_samples_per_second": 16.386,
+    "train_steps_per_second": 0.341
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:974e67b66201b847274c72f8bccd37bc28a91bd779b977a46504b96111e57b61
+size 3771