[WARNING|2024-12-04 13:56:21] logging.py:162 >> `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
[INFO|2024-12-04 13:56:21] logging.py:157 >> Resuming training from saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-100.
[INFO|2024-12-04 13:56:21] logging.py:157 >> Change `output_dir` or use `overwrite_output_dir` to avoid.
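The warning and the resume messages above map onto two Trainer-level settings. A minimal sketch in plain Hugging Face `transformers` (not the LLaMA-Factory wrapper that produced this log; argument values are illustrative assumptions):

```python
from transformers import TrainingArguments

# Sketch: LoRA under DDP needs ddp_find_unused_parameters=False, and resuming
# works by pointing Trainer.train() at an existing checkpoint directory.
args = TrainingArguments(
    output_dir="saves/Llama-3.2-3B-Instruct/lora/train_llama-pii",
    ddp_find_unused_parameters=False,  # the setting the WARNING above refers to
    overwrite_output_dir=False,        # keep existing checkpoints so the run can resume
    bf16=True,
)

# trainer = Trainer(model=model, args=args, train_dataset=train_dataset, ...)
# trainer.train(resume_from_checkpoint=
#     "saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-100")
```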
[INFO|2024-12-04 13:56:21] parser.py:355 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:21] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 13:56:21] configuration_utils.py:800 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-3B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "use_cache": true,
  "vocab_size": 128256
}
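The config dump above can be reproduced outside of training to inspect the architecture. A minimal sketch with the standard `transformers` API; the model ID is taken from the log:

```python
from transformers import AutoConfig

# Sketch: load the same config the log dumps above and inspect a few fields.
config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
print(config.hidden_size)        # 3072
print(config.num_hidden_layers)  # 28
print(config.rope_scaling)       # llama3-style RoPE scaling, factor 32.0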
[INFO|2024-12-04 13:56:21] parser.py:355 >> Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:22] parser.py:355 >> Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:22] parser.py:355 >> Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:22] parser.py:355 >> Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:22] parser.py:355 >> Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
[INFO|2024-12-04 13:56:22] parser.py:355 >> Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
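Each rank reports a single local GPU, which is typical of a one-node DDP launch. A rough sketch of where these per-rank fields come from inside each process (assuming the process group has already been initialized by the launcher; this is not code from the run itself):

```python
import torch
import torch.distributed as dist

# Sketch: the "Process rank / device" fields above correspond to these values.
if dist.is_available() and dist.is_initialized():
    rank = dist.get_rank()                      # "Process rank: N"
    device_index = torch.cuda.current_device()  # "device: cuda:N"
    world_size = dist.get_world_size()
    print(f"rank {rank}, device cuda:{device_index}, world size {world_size}")
```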
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file tokenizer.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/tokenizer.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file added_tokens.json from cache at None
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file special_tokens_map.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/special_tokens_map.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file tokenizer_config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/tokenizer_config.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2024-12-04 13:56:22] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file tokenizer.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/tokenizer.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file added_tokens.json from cache at None
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file special_tokens_map.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/special_tokens_map.json
[INFO|2024-12-04 13:56:22] tokenization_utils_base.py:2289 >> loading file tokenizer_config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/tokenizer_config.json
[INFO|2024-12-04 13:56:23] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2024-12-04 13:56:23] logging.py:157 >> Replace eos token: <|eot_id|>
[INFO|2024-12-04 13:56:23] logging.py:157 >> Add pad token: <|eot_id|>
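The "Replace eos token" / "Add pad token" lines indicate that `<|eot_id|>` (token id 128009) is used both as the end-of-turn EOS for chat-style SFT and as the padding token, since the base tokenizer ships without one. A minimal sketch of the equivalent setup in plain `transformers` (an assumption about what the wrapper does, not code from this run):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Sketch: use <|eot_id|> as EOS and reuse it as the pad token,
# mirroring the two log lines above.
tokenizer.eos_token = "<|eot_id|>"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.eos_token_id, tokenizer.pad_token_id)  # 128009 128009
```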
[INFO|2024-12-04 13:56:23] logging.py:157 >> Loading dataset formatted_dataset.json...
[INFO|2024-12-04 13:56:25] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 13:56:25] modeling_utils.py:3621 >> loading weights file model.safetensors from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/model.safetensors.index.json
[INFO|2024-12-04 13:56:25] modeling_utils.py:1569 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|2024-12-04 13:56:25] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}
[INFO|2024-12-04 13:56:28] modeling_utils.py:4450 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
[INFO|2024-12-04 13:56:28] modeling_utils.py:4458 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|2024-12-04 13:56:28] configuration_utils.py:993 >> loading configuration file generation_config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/generation_config.json
[INFO|2024-12-04 13:56:28] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
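The base model is instantiated in bfloat16, and the checkpoint's generation defaults (sampling with temperature 0.6 and top-p 0.9) are loaded alongside it. A minimal standalone sketch of the equivalent load, assuming a single-GPU context rather than the DDP setup of this log:

```python
import torch
from transformers import AutoModelForCausalLM

# Sketch: load the base model in bfloat16, as the log does before attaching LoRA adapters.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype=torch.bfloat16,
)
print(model.generation_config)  # do_sample=True, temperature=0.6, top_p=0.9 per the dump above
```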
[INFO|2024-12-04 13:56:28] logging.py:157 >> Gradient checkpointing enabled.
[INFO|2024-12-04 13:56:28] logging.py:157 >> Using torch SDPA for faster training and inference.
[INFO|2024-12-04 13:56:28] logging.py:157 >> Upcasting trainable params to float32.
[INFO|2024-12-04 13:56:28] logging.py:157 >> Fine-tuning method: LoRA
[INFO|2024-12-04 13:56:28] logging.py:157 >> Found linear modules: q_proj,gate_proj,down_proj,up_proj,o_proj,v_proj,k_proj
[INFO|2024-12-04 13:56:29] logging.py:157 >> trainable params: 12,156,928 || all params: 3,224,906,752 || trainable%: 0.3770
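All seven linear projection types are targeted, and the reported 12,156,928 trainable parameters are consistent with a LoRA rank of 8: per layer, 8 x [2*(3072+3072) + 2*(3072+1024) + 3*(3072+8192)] = 434,176, and 434,176 x 28 layers = 12,156,928. A minimal sketch of an equivalent PEFT setup; the rank is inferred from that count, and lora_alpha/lora_dropout are assumed values, not read from this log:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct", torch_dtype=torch.bfloat16
)

# Sketch: LoRA over all linear projections, matching the "Found linear modules" line.
lora_config = LoraConfig(
    r=8,                 # inferred from the trainable-parameter count above
    lora_alpha=16,       # assumed
    lora_dropout=0.0,    # assumed
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()
# trainable params: 12,156,928 || all params: 3,224,906,752 || trainable%: 0.3770
```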
[INFO|2024-12-04 13:56:29] trainer.py:648 >> Using auto half precision backend
[INFO|2024-12-04 13:56:29] trainer.py:2526 >> Loading model from saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-100.
[INFO|2024-12-04 13:56:32] trainer.py:2134 >> ***** Running training *****
[INFO|2024-12-04 13:56:32] trainer.py:2135 >> Num examples = 9,507
[INFO|2024-12-04 13:56:32] trainer.py:2136 >> Num Epochs = 3
[INFO|2024-12-04 13:56:32] trainer.py:2137 >> Instantaneous batch size per device = 2
[INFO|2024-12-04 13:56:32] trainer.py:2140 >> Total train batch size (w. parallel, distributed & accumulation) = 128
[INFO|2024-12-04 13:56:32] trainer.py:2141 >> Gradient Accumulation steps = 8
[INFO|2024-12-04 13:56:32] trainer.py:2142 >> Total optimization steps = 222
[INFO|2024-12-04 13:56:32] trainer.py:2143 >> Number of trainable parameters = 12,156,928
[INFO|2024-12-04 13:56:32] trainer.py:2165 >> Continuing training from checkpoint, will skip to saved global_step
[INFO|2024-12-04 13:56:32] trainer.py:2166 >> Continuing training from epoch 1
[INFO|2024-12-04 13:56:32] trainer.py:2167 >> Continuing training from global step 100
[INFO|2024-12-04 13:56:32] trainer.py:2169 >> Will skip the first 1 epochs then the first 208 batches in the first epoch.
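The batch and step counts above fit together as a worked check, assuming 8 DDP processes (the total batch size implies 8 even though only ranks 0-6 are echoed earlier): 128 = 2 per device x 8 accumulation steps x 8 processes; roughly 74 optimization steps per epoch over 9,507 examples gives 222 steps for 3 epochs; and resuming at global step 100 means 1 full epoch (74 steps) plus 26 steps, i.e. 26 x 8 = 208 micro-batches to skip. The exact rounding inside the Trainer may differ slightly, but this sketch reproduces the logged numbers:

```python
import math

# Worked check of the Trainer bookkeeping above (8 DDP processes assumed).
per_device_batch, grad_accum, world_size = 2, 8, 8
num_examples, num_epochs, resume_step = 9_507, 3, 100

total_batch = per_device_batch * grad_accum * world_size                  # 128
micro_batches_per_rank = math.ceil(
    math.ceil(num_examples / world_size) / per_device_batch)              # 595
steps_per_epoch = micro_batches_per_rank // grad_accum                    # 74
total_steps = steps_per_epoch * num_epochs                                # 222

epochs_to_skip = resume_step // steps_per_epoch                           # 1
batches_to_skip = (resume_step - epochs_to_skip * steps_per_epoch) * grad_accum  # 208
print(total_batch, total_steps, epochs_to_skip, batches_to_skip)
```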
[INFO|2024-12-04 13:57:13] logging.py:157 >> {'loss': 1.0489, 'learning_rate': 2.7120e-05, 'epoch': 1.42, 'throughput': 152046.66}
[INFO|2024-12-04 13:57:54] logging.py:157 >> {'loss': 1.0607, 'learning_rate': 2.5354e-05, 'epoch': 1.48, 'throughput': 79588.83}
[INFO|2024-12-04 13:58:35] logging.py:157 >> {'loss': 1.0722, 'learning_rate': 2.3586e-05, 'epoch': 1.55, 'throughput': 55727.70}
[INFO|2024-12-04 13:59:17] logging.py:157 >> {'loss': 1.0662, 'learning_rate': 2.1825e-05, 'epoch': 1.62, 'throughput': 43302.60}
[INFO|2024-12-04 14:00:00] logging.py:157 >> {'loss': 1.0548, 'learning_rate': 2.0079e-05, 'epoch': 1.69, 'throughput': 35890.41}
[INFO|2024-12-04 14:00:41] logging.py:157 >> {'loss': 1.0645, 'learning_rate': 1.8359e-05, 'epoch': 1.75, 'throughput': 31092.14}
[INFO|2024-12-04 14:01:20] logging.py:157 >> {'loss': 1.0671, 'learning_rate': 1.6672e-05, 'epoch': 1.82, 'throughput': 27982.89}
[INFO|2024-12-04 14:01:59] logging.py:157 >> {'loss': 1.0775, 'learning_rate': 1.5026e-05, 'epoch': 1.89, 'throughput': 25513.14}
[INFO|2024-12-04 14:02:39] logging.py:157 >> {'loss': 1.0781, 'learning_rate': 1.3430e-05, 'epoch': 1.95, 'throughput': 23537.72}
[INFO|2024-12-04 14:03:19] logging.py:157 >> {'loss': 1.0598, 'learning_rate': 1.1892e-05, 'epoch': 2.02, 'throughput': 21993.43}
[INFO|2024-12-04 14:03:59] logging.py:157 >> {'loss': 1.0507, 'learning_rate': 1.0420e-05, 'epoch': 2.09, 'throughput': 20677.45}
[INFO|2024-12-04 14:04:39] logging.py:157 >> {'loss': 1.0532, 'learning_rate': 9.0208e-06, 'epoch': 2.16, 'throughput': 19586.24}
[INFO|2024-12-04 14:05:19] logging.py:157 >> {'loss': 1.0589, 'learning_rate': 7.7015e-06, 'epoch': 2.22, 'throughput': 18691.67}
[INFO|2024-12-04 14:05:58] logging.py:157 >> {'loss': 1.0340, 'learning_rate': 6.4688e-06, 'epoch': 2.29, 'throughput': 17909.19}
[INFO|2024-12-04 14:06:37] logging.py:157 >> {'loss': 1.0442, 'learning_rate': 5.3288e-06, 'epoch': 2.36, 'throughput': 17276.50}
[INFO|2024-12-04 14:07:16] logging.py:157 >> {'loss': 1.0579, 'learning_rate': 4.2873e-06, 'epoch': 2.43, 'throughput': 16671.13}
[INFO|2024-12-04 14:07:57] logging.py:157 >> {'loss': 1.0397, 'learning_rate': 3.3494e-06, 'epoch': 2.49, 'throughput': 16124.99}
[INFO|2024-12-04 14:08:37] logging.py:157 >> {'loss': 1.0249, 'learning_rate': 2.5198e-06, 'epoch': 2.56, 'throughput': 15650.52}
[INFO|2024-12-04 14:09:17] logging.py:157 >> {'loss': 1.0343, 'learning_rate': 1.8028e-06, 'epoch': 2.63, 'throughput': 15222.33}
[INFO|2024-12-04 14:09:58] logging.py:157 >> {'loss': 1.0450, 'learning_rate': 1.2018e-06, 'epoch': 2.69, 'throughput': 14816.79}
[INFO|2024-12-04 14:09:58] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-200
[INFO|2024-12-04 14:09:58] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 14:09:58] configuration_utils.py:800 >> Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "use_cache": true,
  "vocab_size": 128256
}
[INFO|2024-12-04 14:09:58] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-200/tokenizer_config.json
[INFO|2024-12-04 14:09:58] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-200/special_tokens_map.json
[INFO|2024-12-04 14:10:38] logging.py:157 >> {'loss': 1.0425, 'learning_rate': 7.1996e-07, 'epoch': 2.76, 'throughput': 14459.94}
[INFO|2024-12-04 14:11:17] logging.py:157 >> {'loss': 1.0649, 'learning_rate': 3.5960e-07, 'epoch': 2.83, 'throughput': 14155.22}
[INFO|2024-12-04 14:11:59] logging.py:157 >> {'loss': 1.0439, 'learning_rate': 1.2256e-07, 'epoch': 2.90, 'throughput': 13843.87}
[INFO|2024-12-04 14:12:38] logging.py:157 >> {'loss': 1.0382, 'learning_rate': 1.0012e-08, 'epoch': 2.96, 'throughput': 13593.13}
[INFO|2024-12-04 14:12:54] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-222
[INFO|2024-12-04 14:12:54] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 14:12:55] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-222/tokenizer_config.json
[INFO|2024-12-04 14:12:55] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/checkpoint-222/special_tokens_map.json
[INFO|2024-12-04 14:12:55] trainer.py:2394 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2024-12-04 14:12:55] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-3B-Instruct/lora/train_llama-pii
[INFO|2024-12-04 14:12:55] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/jiaheng/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95/config.json
[INFO|2024-12-04 14:12:55] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/tokenizer_config.json
[INFO|2024-12-04 14:12:55] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii/special_tokens_map.json
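At this point the final LoRA adapter and tokenizer files live in saves/Llama-3.2-3B-Instruct/lora/train_llama-pii. A minimal sketch of loading that adapter onto the base model for inference with PEFT; the paths come from the log, and the merge step is an assumed (optional) follow-up rather than something this run performs:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_dir = "saves/Llama-3.2-3B-Instruct/lora/train_llama-pii"

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Attach the trained LoRA adapter; merge_and_unload() folds it into the base weights.
model = PeftModel.from_pretrained(base, adapter_dir)
model = model.merge_and_unload()  # optional: produces a standalone merged model
```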
[WARNING|2024-12-04 14:12:55] logging.py:162 >> No metric eval_loss to plot.
[WARNING|2024-12-04 14:12:55] logging.py:162 >> No metric eval_accuracy to plot.
[INFO|2024-12-04 14:12:55] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
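The two warnings above mean no validation split was configured, so there are no eval_loss or eval_accuracy curves to plot, and the auto-generated model card lacks dataset metadata for its result entry. If evaluation is wanted on a future run, a held-out split plus periodic evaluation can be requested through the trainer arguments; a hedged sketch in plain `transformers` with illustrative values:

```python
from transformers import TrainingArguments

# Sketch: enable periodic evaluation so eval_loss is logged and can be plotted.
args = TrainingArguments(
    output_dir="saves/Llama-3.2-3B-Instruct/lora/train_llama-pii",
    eval_strategy="steps",         # run evaluation every eval_steps
    eval_steps=50,
    per_device_eval_batch_size=2,
)
# Pass an eval_dataset to the Trainer alongside train_dataset.
```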