Update config file and model.safetensors
- README.md +3 -0
- config.json +8 -11
- generation_config.json +1 -1
- pytorch_model.bin → model.safetensors +2 -2
- modeling_rotary_indictrans.py +3 -1
- tokenizer_config.json +1 -1
README.md
CHANGED
@@ -78,5 +78,8 @@ If you use these models directly or fine-tune them further for additional use cases
 }
 ```
 
+# Note
+These new and improved models are primarily built and tested for document-level and long-context translation; performance on smaller sentence-level tasks might be sub-optimal and might require generation parameter tuning. Please thoroughly verify the performance of the models for your use case before scaling up generation.
+
 # Warning
 Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
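The warning above points at `num_beams`, `repetition_penalty`, and `length_penalty` as the knobs to tune. Below is a minimal sketch of passing such overrides at `generate()` time with the `transformers` API; the parameter values are illustrative only, and the loading path (`AutoTokenizer`/`AutoModelForSeq2SeqLM` with `trust_remote_code=True`) plus the IndicTrans language-tag preprocessing are assumptions to be checked against the model card.

```python
# Sketch only: tune the generation parameters mentioned in the README warning.
# Assumes the custom modeling/tokenizer code resolves via trust_remote_code;
# real usage also needs the IndicTrans language-tag preprocessing described in
# the model card. Parameter values are examples, not recommended settings.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

repo = "prajdabre/rotary-indictrans2-en-indic-1B"
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("An already-preprocessed English input.", return_tensors="pt")

with torch.inference_mode():
    out = model.generate(
        **inputs,
        num_beams=4,             # lower the beam size if the 1B model runs out of memory
        repetition_penalty=1.2,  # penalise repeated tokens
        length_penalty=1.0,      # >1 favours longer hypotheses, <1 shorter ones
        max_new_tokens=256,
    )

print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```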
config.json
CHANGED
@@ -10,7 +10,7 @@
   "decoder_attention_heads": 16,
   "decoder_embed_dim": 1024,
   "decoder_ffn_dim": 8192,
-  "decoder_layerdrop": 0,
+  "decoder_layerdrop": 0.0,
   "decoder_layers": 18,
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
   "encoder_attention_heads": 16,
   "encoder_embed_dim": 1024,
   "encoder_ffn_dim": 8192,
-  "encoder_layerdrop": 0,
+  "encoder_layerdrop": 0.0,
   "encoder_layers": 18,
   "encoder_normalize_before": true,
   "encoder_vocab_size": 32322,
@@ -27,21 +27,18 @@
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "layernorm_embedding": false,
+  "max_source_positions": 8192,
+  "max_target_positions": 8192,
   "model_type": "RotaryIndicTrans",
   "num_hidden_layers": 18,
   "pad_token_id": 1,
   "rope_args": {
-    "theta":
+    "theta": 50000
   },
   "scale_embedding": true,
   "share_decoder_input_output_embed": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.47.1",
   "use_cache": true,
-  "
-
-    "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-    "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-  },
-  "tokenizer_class": "IndicTransTokenizer"
-}
+  "vocab_size": 122672
+}
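A quick way to confirm the updated fields after cloning the repository is to read `config.json` directly; the local path in the sketch below is a placeholder, not part of this commit.

```python
# Check the config fields changed in this commit; the repo path is hypothetical.
import json
from pathlib import Path

repo_dir = Path("rotary-indictrans2-en-indic-1B")  # wherever the repo was cloned
config = json.loads((repo_dir / "config.json").read_text())

assert config["rope_args"]["theta"] == 50000
assert config["max_source_positions"] == 8192
assert config["max_target_positions"] == 8192
assert config["decoder_layerdrop"] == 0.0 and config["encoder_layerdrop"] == 0.0
print(config["transformers_version"], config["vocab_size"])  # 4.47.1 122672
```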
generation_config.json
CHANGED
@@ -4,5 +4,5 @@
   "decoder_start_token_id": 2,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.
+  "transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ccc09ebd3f942c279999571951cc8aae20a1f4d123a13b353dad9e2c42290721
+size 4462265272
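Because the weights file was renamed and its LFS pointer changed, a downloaded copy can be checked against the new pointer by comparing its size and SHA-256. A small sketch; the local path is a placeholder.

```python
# Verify a downloaded model.safetensors against the LFS pointer in this commit.
import hashlib
from pathlib import Path

path = Path("rotary-indictrans2-en-indic-1B/model.safetensors")  # hypothetical path
expected_sha256 = "ccc09ebd3f942c279999571951cc8aae20a1f4d123a13b353dad9e2c42290721"
expected_size = 4462265272

assert path.stat().st_size == expected_size, "size mismatch with LFS pointer"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert digest.hexdigest() == expected_sha256, "sha256 mismatch with LFS pointer"
print("model.safetensors matches the LFS pointer")
```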
modeling_rotary_indictrans.py
CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
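The added lines spell out the standard rotary-embedding inverse-frequency formula, inv_freq_i = 1 / theta^(2i / dim). The standalone sketch below is not code from this repository; it only illustrates how those frequencies turn positions into rotation angles, and why a larger `theta` (such as the 50000 now set in `config.json`) slows the rotations, which is the usual lever for longer contexts.

```python
# Standalone illustration of the standard RoPE frequency computation that the
# added lines implement; dim/theta/max_seq_len mirror this repo's config, but
# the snippet itself is not taken from modeling_rotary_indictrans.py.
import torch

dim, theta, max_seq_len = 1024, 50000, 8192

# One inverse frequency per pair of embedding dimensions: 1 / theta ** (2i / dim).
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # (dim/2,)

# Angle for every (position, frequency) pair; their cos/sin rotate the
# query/key projections in rotary attention.
positions = torch.arange(max_seq_len).float()
freqs = torch.outer(positions, inv_freq)  # (max_seq_len, dim/2)
cos, sin = freqs.cos(), freqs.sin()

print(inv_freq.shape, freqs.shape)  # torch.Size([512]) torch.Size([8192, 512])
```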
tokenizer_config.json
CHANGED
@@ -37,7 +37,7 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length":
+  "model_max_length": 8192,
   "pad_token": "<pad>",
   "name_or_path": "prajdabre/rotary-indictrans2-en-indic-1B",
   "tokenizer_class": "IndicTransTokenizer",