prajdabre committed (verified)
Commit 0711dc7 · 1 Parent(s): c796d39

Bulk update: multiple file changes

Files changed (4):
  1. README.md +11 -16
  2. config.json +2 -2
  3. tokenization_indictrans.py +2 -4
  4. tokenizer_config.json +2 -2
README.md CHANGED
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
@@ -21,7 +21,7 @@ from IndicTransToolkit import IndicProcessor
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 warnings.filterwarnings("ignore")
-model_name = "prajdabre/rotary-indictrans2-dist-indic-en-200M"
+model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 sentences = [
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)
 
-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,23 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )
 
-with tokenizer.as_target_tokenizer():
-    outputs = tokenizer.batch_decode(
-        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )
+# no target_tokenizer scoping is required anymore
+outputs = tokenizer.batch_decode(
+    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+)
 
 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
+print(" | > Translations:", outputs[0])
 ```
 
 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
-@misc{gumma2024inducingdocumentlevelabilitiesstandard,
-      title={Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models},
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+      title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
       author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-      year={2024},
+      year={2025},
       eprint={2408.11382},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
@@ -81,7 +79,4 @@ If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 ```
 
 # Warning
-Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
-
-
-
+Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
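The README's opening paragraph (the `@@ -5,7 +5,7 @@` hunk above) states that the only architectural change relative to the IndicTrans2 parents is swapping Sinusoidal for Rotary Positional Embeddings. For intuition only, here is a minimal sketch of the rotary idea from Su _et al._; it is not the code in `modeling_rotary_indictrans.py`, and the feature-pairing scheme, tensor shapes, and `base` value are assumptions.

```python
import torch

def apply_rope(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Rotate features of x (seq_len x dim, dim even) by position-dependent angles."""
    seq_len, dim = x.shape
    half = dim // 2
    # One frequency per feature pair, as in Su et al. (2021).
    inv_freq = 1.0 / (base ** (torch.arange(half, dtype=torch.float32) / half))
    angles = torch.arange(seq_len, dtype=torch.float32)[:, None] * inv_freq[None, :]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:, :half], x[:, half:]
    # Each (x1_i, x2_i) pair is rotated by an angle proportional to its position,
    # so attention scores depend on relative offsets rather than absolute positions,
    # which is what helps the model handle inputs longer than those seen in training.
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

# Applied to queries and keys before attention, e.g.:
q = apply_rope(torch.randn(8, 64))
```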
 
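Because this diff only shows fragments of the README's usage snippet, here is a minimal end-to-end sketch assembled from those fragments. The `IndicProcessor` preprocessing call, the example sentence, the language codes (`hin_Deva` to `eng_Latn`), and the generation settings other than `early_stopping=True` are assumptions based on the usual IndicTransToolkit/IndicTrans2 conventions, not part of this commit.

```python
import warnings

import torch
from IndicTransToolkit import IndicProcessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

warnings.filterwarnings("ignore")

model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code is needed because config.json / tokenizer_config.json map
# onto the custom RotaryIndicTrans* and IndicTransTokenizer classes in this repo.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to(device)

ip = IndicProcessor(inference=True)
sentences = ["यह एक उदाहरण वाक्य है।"]  # placeholder Hindi input

# Normalize and tag the source sentences (language codes are assumptions).
batch = ip.preprocess_batch(sentences, src_lang="hin_Deva", tgt_lang="eng_Latn")
batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    # num_beams / max_length here are illustrative values.
    outputs = model.generate(**batch, num_beams=5, max_length=2048, early_stopping=True)

outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```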
 
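As a concrete illustration of the tuning advice in the Warning above, the call below continues from the `model` and `batch` objects of the README snippet; the specific values are examples to experiment with, not recommended settings.

```python
import torch

# model and batch are assumed to come from the usage snippet above.
with torch.inference_mode():
    outputs = model.generate(
        **batch,
        num_beams=3,             # a lower beam size reduces memory pressure
        repetition_penalty=1.2,  # discourages repeated phrases
        length_penalty=1.0,      # >1.0 favours longer outputs, <1.0 shorter ones
        max_length=2048,
        early_stopping=True,
    )
```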
 
 
 
config.json CHANGED
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.2,
+  "dropout": 0.0,
   "encoder_attention_heads": 8,
   "encoder_embed_dim": 512,
   "encoder_ffn_dim": 2048,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
   "use_cache": true,
-  "_name_or_path": "VarunGumma/rotary-indictrans2-indic-en-dist-200M",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
 
tokenization_indictrans.py CHANGED
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.src_vocab_fp,
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size
 
-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)
 
-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)
 
@@ -251,4 +249,4 @@ class IndicTransTokenizer(PreTrainedTokenizer):
         with open(fp, "wb") as f:
             f.write(spm.serialized_model_proto())
 
-        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
+        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
 
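The tokenizer edits above fix the `tgt_vocab_file` argument, drop the `lru_cache` decorators from the private token/id conversion methods, and touch the return statement of the vocabulary-saving helper. Those private methods sit behind the standard `PreTrainedTokenizer` conversion API, so a small round-trip check like the sketch below exercises them; the `</s>` token is taken from `tokenizer_config.json` rather than this file.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-dist-200M", trust_remote_code=True
)

# convert_tokens_to_ids / convert_ids_to_tokens route through the
# _convert_token_to_id / _convert_id_to_token methods edited above.
eos_id = tok.convert_tokens_to_ids("</s>")
print(eos_id, tok.convert_ids_to_tokens(eos_id))  # should round-trip to "</s>"
```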
tokenizer_config.json CHANGED
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 256,
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "ai4bharat/indictrans2-en-indic-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [
 