Bulk update: multiple file changes
- README.md +11 -16
- config.json +2 -2
- tokenization_indictrans.py +2 -4
- tokenizer_config.json +2 -2
README.md
CHANGED
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.

 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).

 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.

@@ -21,7 +21,7 @@ from IndicTransToolkit import IndicProcessor
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

 warnings.filterwarnings("ignore")
-model_name = "prajdabre/rotary-indictrans2-
+model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
 device = "cuda" if torch.cuda.is_available() else "cpu"

 sentences = [
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)

-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,23 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )

-
-
-
-
+    # no target_tokenizer scoping is required anymore
+    outputs = tokenizer.batch_decode(
+        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+    )

 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
+print(" | > Translations:", outputs[0])
 ```

 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:

 ```bibtex
-@misc{
-  title={Towards Inducing
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+  title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
   author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-  year={
+  year={2025},
   eprint={2408.11382},
   archivePrefix={arXiv},
   primaryClass={cs.CL},
@@ -81,7 +79,4 @@ If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 ```

 # Warning
-Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
-
-
-
+Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
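The hunks above show only fragments of the updated usage snippet, so here is a runnable sketch of the full pipeline for orientation. The `preprocess_batch` call, the Hindi example sentence, and every generation setting except `early_stopping=True` are assumptions based on the usual IndicTrans2 / IndicTransToolkit workflow, not lines taken from this commit.

```python
import warnings

import torch
from IndicTransToolkit import IndicProcessor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

warnings.filterwarnings("ignore")

model_name = "prajdabre/rotary-indictrans2-indic-en-dist-200M"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Custom tokenizer/model classes ship with the repo, hence trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to(device)

ip = IndicProcessor(inference=True)

# Hypothetical Hindi input; any supported Indic language works for the indic-en direction.
sentences = ["यह एक उदाहरण वाक्य है।"]

# Adds the source/target language tags the model expects (assumed preprocess_batch signature).
batch = ip.preprocess_batch(sentences, src_lang="hin_Deva", tgt_lang="eng_Latn")

batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    outputs = model.generate(
        **batch,
        max_length=2048,   # assumed; tune to your inputs
        num_beams=5,       # assumed; see the Warning section above
        early_stopping=True,
    )

    # No target_tokenizer scoping is required anymore.
    outputs = tokenizer.batch_decode(
        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

# Strips language tags and detokenizes into plain English.
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```

`trust_remote_code=True` is needed because the `auto_map` entries in `config.json` and `tokenizer_config.json` (changed below) route the auto classes to the custom `RotaryIndicTrans*` code bundled with the repo.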
config.json
CHANGED
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.
+  "dropout": 0.0,
   "encoder_attention_heads": 8,
   "encoder_embed_dim": 512,
   "encoder_ffn_dim": 2048,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
   "use_cache": true,
-  "
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
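Beyond the two keys changed here (an explicit `dropout` of 0.0 and the repo's `name_or_path`), the surrounding `auto_map` routes `AutoConfig` and `AutoModelForSeq2SeqLM` to the custom rotary classes in the repo, so any programmatic load needs `trust_remote_code=True`. A minimal sketch, assuming Hub access:

```python
# Hedged sketch: inspect the updated config without instantiating the full model.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-dist-200M",
    trust_remote_code=True,  # resolves configuration_rotary_indictrans.RotaryIndicTransConfig
)

print(type(config).__name__)      # expected: RotaryIndicTransConfig
print(config.dropout)             # 0.0 after this commit
print(config.decoder_vocab_size)  # 32296
```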
tokenization_indictrans.py
CHANGED
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):

         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size

-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)

-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)

@@ -251,4 +249,4 @@ class IndicTransTokenizer(PreTrainedTokenizer):
         with open(fp, "wb") as f:
             f.write(spm.serialized_model_proto())

-        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
+        return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
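This file change passes the correct `tgt_vocab_file` to `super().__init__()` and drops the `@lru_cache` decorators from the per-token conversion hooks; a plausible motivation is that `functools.lru_cache` on bound methods keys the cache on `self` (keeping the tokenizer instance alive) while the wrapped dict lookups are already constant-time. The hooks sit behind the standard `PreTrainedTokenizer` conversion API, which the sketch below exercises; the out-of-vocabulary probe string is hypothetical.

```python
# Hedged sketch: the private hooks edited above back the public conversion API.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-dist-200M",
    trust_remote_code=True,  # loads IndicTransTokenizer via the auto_map
)

# convert_tokens_to_ids() routes through _convert_token_to_id(); unknown tokens
# fall back to unk_token_id, mirroring encoder.get(token, self.unk_token_id).
probe_id = tokenizer.convert_tokens_to_ids("not-a-real-subword")
print(probe_id == tokenizer.unk_token_id)         # True
print(tokenizer.convert_ids_to_tokens(probe_id))  # "<unk>"
```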
tokenizer_config.json
CHANGED
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length":
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [
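The two keys updated here pin the tokenizer-level length limit and the repo id. A small sketch to confirm what a fresh load picks up, assuming Hub access; note that the `max_length=2048` used in the README example stays well inside this limit.

```python
# Hedged sketch: check the values set in tokenizer_config.json after this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-dist-200M", trust_remote_code=True
)

print(type(tokenizer).__name__)    # expected: IndicTransTokenizer (via the auto_map)
print(tokenizer.model_max_length)  # 4096
print(tokenizer.name_or_path)      # prajdabre/rotary-indictrans2-indic-en-dist-200M
```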