prajdabre committed (verified)
Commit 0711dc7 · 1 Parent(s): c796d39

Bulk update: multiple file changes

Files changed (4):
  1. README.md +11 -16
  2. config.json +2 -2
  3. tokenization_indictrans.py +2 -4
  4. tokenizer_config.json +2 -2
README.md CHANGED
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
@@ -21,7 +21,7 @@ from IndicTransToolkit import IndicProcessor
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 warnings.filterwarnings("ignore")
-model_name = "prajdabre/rotary-indictrans2-dist-indic-en-200M"
+model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 sentences = [
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)
 
-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,23 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )
 
-with tokenizer.as_target_tokenizer():
-    outputs = tokenizer.batch_decode(
-        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )
+# no target_tokenizer scoping is required anymore
+outputs = tokenizer.batch_decode(
+    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+)
 
 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
+print(" | > Translations:", outputs[0])
 ```
 
 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
-@misc{gumma2024inducingdocumentlevelabilitiesstandard,
-      title={Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models},
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+      title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
       author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-      year={2024},
+      year={2025},
       eprint={2408.11382},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
@@ -81,7 +79,4 @@ If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 ```
 
 # Warning
-Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
-
-
-
+Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
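The README's opening paragraph (the `@@ -5,7 +5,7 @@` hunk above) states that the only architectural change relative to the IndicTrans2 parents is swapping Sinusoidal for Rotary Positional Embeddings. For intuition only, here is a minimal sketch of the rotary idea from Su _et al._; it is not the code in `modeling_rotary_indictrans.py`, and the feature-pairing scheme, tensor shapes, and `base` value are assumptions.

```python
import torch

def apply_rope(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Rotate features of x (seq_len x dim, dim even) by position-dependent angles."""
    seq_len, dim = x.shape
    half = dim // 2
    # One frequency per feature pair, as in Su et al. (2021).
    inv_freq = 1.0 / (base ** (torch.arange(half, dtype=torch.float32) / half))
    angles = torch.arange(seq_len, dtype=torch.float32)[:, None] * inv_freq[None, :]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:, :half], x[:, half:]
    # Each (x1_i, x2_i) pair is rotated by an angle proportional to its position,
    # so attention scores depend on relative offsets rather than absolute positions,
    # which is what helps the model handle inputs longer than those seen in training.
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

# Applied to queries and keys before attention, e.g.:
q = apply_rope(torch.randn(8, 64))
```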
 
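Because this diff only shows fragments of the README's usage snippet, here is a minimal end-to-end sketch assembled from those fragments. The `IndicProcessor` preprocessing call, the example sentence, the language codes (`hin_Deva` to `eng_Latn`), and the generation settings other than `early_stopping=True` are assumptions based on the usual IndicTransToolkit/IndicTrans2 conventions, not part of this commit.

```python
import warnings

import torch
from IndicTransToolkit import IndicProcessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

warnings.filterwarnings("ignore")

model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code is needed because config.json / tokenizer_config.json map
# onto the custom RotaryIndicTrans* and IndicTransTokenizer classes in this repo.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to(device)

ip = IndicProcessor(inference=True)
sentences = ["यह एक उदाहरण वाक्य है।"]  # placeholder Hindi input

# Normalize and tag the source sentences (language codes are assumptions).
batch = ip.preprocess_batch(sentences, src_lang="hin_Deva", tgt_lang="eng_Latn")
batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    # num_beams / max_length here are illustrative values.
    outputs = model.generate(**batch, num_beams=5, max_length=2048, early_stopping=True)

outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```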
 
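As a concrete illustration of the tuning advice in the Warning above, the call below continues from the `model` and `batch` objects of the README snippet; the specific values are examples to experiment with, not recommended settings.

```python
import torch

# model and batch are assumed to come from the usage snippet above.
with torch.inference_mode():
    outputs = model.generate(
        **batch,
        num_beams=3,             # a lower beam size reduces memory pressure
        repetition_penalty=1.2,  # discourages repeated phrases
        length_penalty=1.0,      # >1.0 favours longer outputs, <1.0 shorter ones
        max_length=2048,
        early_stopping=True,
    )
```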
 
 
 
config.json CHANGED
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.2,
+  "dropout": 0.0,
   "encoder_attention_heads": 8,
   "encoder_embed_dim": 512,
   "encoder_ffn_dim": 2048,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
   "use_cache": true,
-  "_name_or_path": "VarunGumma/rotary-indictrans2-indic-en-dist-200M",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
 
tokenization_indictrans.py CHANGED
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.src_vocab_fp,
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size
 
-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)
 
-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)
 
@@ -251,4 +249,4 @@ class IndicTransTokenizer(PreTrainedTokenizer):
         with open(fp, "wb") as f:
             f.write(spm.serialized_model_proto())
 
-        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
+        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
 
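The tokenizer edits above fix the `tgt_vocab_file` argument, drop the `lru_cache` decorators from the private token/id conversion methods, and touch the return statement of the vocabulary-saving helper. Those private methods sit behind the standard `PreTrainedTokenizer` conversion API, so a small round-trip check like the sketch below exercises them; the `</s>` token is taken from `tokenizer_config.json` rather than this file.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-dist-200M", trust_remote_code=True
)

# convert_tokens_to_ids / convert_ids_to_tokens route through the
# _convert_token_to_id / _convert_id_to_token methods edited above.
eos_id = tok.convert_tokens_to_ids("</s>")
print(eos_id, tok.convert_ids_to_tokens(eos_id))  # should round-trip to "</s>"
```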
tokenizer_config.json CHANGED
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 256,
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "ai4bharat/indictrans2-en-indic-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [
 