Update config file and model.safetensors
- README.md +3 -0
- config.json +8 -11
- generation_config.json +1 -1
- pytorch_model.bin → model.safetensors +2 -2
- modeling_rotary_indictrans.py +3 -1
- tokenizer_config.json +1 -1
README.md
CHANGED
@@ -78,5 +78,8 @@ If you use these models directly or fine-tune them further for additional use cases
 }
 ```
 
+# Note
+These new and improved models are primarily built and tested for document-level and long-context translation; performance on smaller sentence-level tasks might be sub-optimal and might require generation parameter tuning. Please thoroughly verify the performance of the models for your use case before scaling up generation.
+
 # Warning
 Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
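The warning above points at `num_beams`, `repetition_penalty`, and `length_penalty` as the knobs to tune. Below is a minimal sketch of passing such overrides at `generate()` time with the `transformers` API; the parameter values are illustrative only, and the loading path (`AutoTokenizer`/`AutoModelForSeq2SeqLM` with `trust_remote_code=True`) plus the IndicTrans language-tag preprocessing are assumptions to be checked against the model card.

```python
# Sketch only: tune the generation parameters mentioned in the README warning.
# Assumes the custom modeling/tokenizer code resolves via trust_remote_code;
# real usage also needs the IndicTrans language-tag preprocessing described in
# the model card. Parameter values are examples, not recommended settings.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

repo = "prajdabre/rotary-indictrans2-en-indic-1B"
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("An already-preprocessed English input.", return_tensors="pt")

with torch.inference_mode():
    out = model.generate(
        **inputs,
        num_beams=4,             # lower the beam size if the 1B model runs out of memory
        repetition_penalty=1.2,  # penalise repeated tokens
        length_penalty=1.0,      # >1 favours longer hypotheses, <1 shorter ones
        max_new_tokens=256,
    )

print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```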
config.json
CHANGED
@@ -10,7 +10,7 @@
   "decoder_attention_heads": 16,
   "decoder_embed_dim": 1024,
   "decoder_ffn_dim": 8192,
-  "decoder_layerdrop": 0,
+  "decoder_layerdrop": 0.0,
   "decoder_layers": 18,
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
   "encoder_attention_heads": 16,
   "encoder_embed_dim": 1024,
   "encoder_ffn_dim": 8192,
-  "encoder_layerdrop": 0,
+  "encoder_layerdrop": 0.0,
   "encoder_layers": 18,
   "encoder_normalize_before": true,
   "encoder_vocab_size": 32322,
@@ -27,21 +27,18 @@
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "layernorm_embedding": false,
+  "max_source_positions": 8192,
+  "max_target_positions": 8192,
   "model_type": "RotaryIndicTrans",
   "num_hidden_layers": 18,
   "pad_token_id": 1,
   "rope_args": {
-    "theta":
+    "theta": 50000
   },
   "scale_embedding": true,
   "share_decoder_input_output_embed": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.47.1",
   "use_cache": true,
-  "
-
-    "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-    "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-  },
-  "tokenizer_class": "IndicTransTokenizer"
-}
+  "vocab_size": 122672
+}
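A quick way to confirm the updated fields after cloning the repository is to read `config.json` directly; the local path in the sketch below is a placeholder, not part of this commit.

```python
# Check the config fields changed in this commit; the repo path is hypothetical.
import json
from pathlib import Path

repo_dir = Path("rotary-indictrans2-en-indic-1B")  # wherever the repo was cloned
config = json.loads((repo_dir / "config.json").read_text())

assert config["rope_args"]["theta"] == 50000
assert config["max_source_positions"] == 8192
assert config["max_target_positions"] == 8192
assert config["decoder_layerdrop"] == 0.0 and config["encoder_layerdrop"] == 0.0
print(config["transformers_version"], config["vocab_size"])  # 4.47.1 122672
```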
generation_config.json
CHANGED
@@ -4,5 +4,5 @@
   "decoder_start_token_id": 2,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.
+  "transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ccc09ebd3f942c279999571951cc8aae20a1f4d123a13b353dad9e2c42290721
+size 4462265272
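Because the weights file was renamed and its LFS pointer changed, a downloaded copy can be checked against the new pointer by comparing its size and SHA-256. A small sketch; the local path is a placeholder.

```python
# Verify a downloaded model.safetensors against the LFS pointer in this commit.
import hashlib
from pathlib import Path

path = Path("rotary-indictrans2-en-indic-1B/model.safetensors")  # hypothetical path
expected_sha256 = "ccc09ebd3f942c279999571951cc8aae20a1f4d123a13b353dad9e2c42290721"
expected_size = 4462265272

assert path.stat().st_size == expected_size, "size mismatch with LFS pointer"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert digest.hexdigest() == expected_sha256, "sha256 mismatch with LFS pointer"
print("model.safetensors matches the LFS pointer")
```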
modeling_rotary_indictrans.py
CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
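The added lines spell out the standard rotary-embedding inverse-frequency formula, inv_freq_i = 1 / theta^(2i / dim). The standalone sketch below is not code from this repository; it only illustrates how those frequencies turn positions into rotation angles, and why a larger `theta` (such as the 50000 now set in `config.json`) slows the rotations, which is the usual lever for longer contexts.

```python
# Standalone illustration of the standard RoPE frequency computation that the
# added lines implement; dim/theta/max_seq_len mirror this repo's config, but
# the snippet itself is not taken from modeling_rotary_indictrans.py.
import torch

dim, theta, max_seq_len = 1024, 50000, 8192

# One inverse frequency per pair of embedding dimensions: 1 / theta ** (2i / dim).
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # (dim/2,)

# Angle for every (position, frequency) pair; their cos/sin rotate the
# query/key projections in rotary attention.
positions = torch.arange(max_seq_len).float()
freqs = torch.outer(positions, inv_freq)  # (max_seq_len, dim/2)
cos, sin = freqs.cos(), freqs.sin()

print(inv_freq.shape, freqs.shape)  # torch.Size([512]) torch.Size([8192, 512])
```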
tokenizer_config.json
CHANGED
@@ -37,7 +37,7 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length":
+  "model_max_length": 8192,
   "pad_token": "<pad>",
   "name_or_path": "prajdabre/rotary-indictrans2-en-indic-1B",
   "tokenizer_class": "IndicTransTokenizer",