futranbg committed on
Commit
307936e
1 Parent(s): a7dc181

Create vinai_translator.py

Browse files
Files changed (1) hide show
  1. vinai_translator.py +94 -0
vinai_translator.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+
4
# Lowercase (misplaced, corrected) pairs for the vowel clusters "oa", "oe" and
# "uy": each key carries the tone mark on the first vowel, each value carries
# the same tone mark on the second vowel.
_LOWERCASE_TONE_FIXES = (
    ("òa", "oà"),
    ("óa", "oá"),
    ("ỏa", "oả"),
    ("õa", "oã"),
    ("ọa", "oạ"),
    ("òe", "oè"),
    ("óe", "oé"),
    ("ỏe", "oẻ"),
    ("õe", "oẽ"),
    ("ọe", "oẹ"),
    ("ùy", "uỳ"),
    ("úy", "uý"),
    ("ủy", "uỷ"),
    ("ũy", "uỹ"),
    ("ụy", "uỵ"),
)

# Substring replacement table applied to Vietnamese input by translate_vi2en.
# Every lowercase pair is expanded to its lowercase, Capitalized and UPPERCASE
# variants, in that order (e.g. "òa" -> "oà", "Òa" -> "Oà", "ÒA" -> "OÀ").
dict_map = {
    recase(misplaced): recase(corrected)
    for misplaced, corrected in _LOWERCASE_TONE_FIXES
    for recase in (str.lower, str.capitalize, str.upper)
}
51
+
52
# Vietnamese -> English: tokenizer and seq2seq model are loaded once at import
# time from the same checkpoint and shared by every translate_vi2en call.
_VI2EN_CHECKPOINT = "vinai/vinai-translate-vi2en"
tokenizer_vi2en = AutoTokenizer.from_pretrained(_VI2EN_CHECKPOINT, src_lang="vi_VN")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained(_VI2EN_CHECKPOINT)
54
+
55
def translate_vi2en(vi_text: str) -> str:
    """Translate Vietnamese text into English.

    Every key of ``dict_map`` found in the input is first replaced by its
    value (tone-mark placement normalization). The result is tokenized with
    ``tokenizer_vi2en`` and decoded by ``model_vi2en`` using beam search.

    Args:
        vi_text: Vietnamese source text.

    Returns:
        The English translation, or ``""`` when ``vi_text`` is empty or
        contains only whitespace.
    """
    # Nothing to translate: return early instead of running beam search on an
    # empty prompt and handing back whatever the model happens to generate.
    if not vi_text or vi_text.isspace():
        return ""

    for misplaced, corrected in dict_map.items():
        vi_text = vi_text.replace(misplaced, corrected)

    input_ids = tokenizer_vi2en(vi_text, return_tensors="pt").input_ids
    output_ids = model_vi2en.generate(
        input_ids,
        # Start decoding with the English language code.
        decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
        num_return_sequences=1,
        # Beam search decoding.
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer_vi2en.batch_decode(output_ids, skip_special_tokens=True)
    return " ".join(decoded)
74
+
75
# English -> Vietnamese: tokenizer and seq2seq model are loaded once at import
# time from the same checkpoint and shared by every translate_en2vi call.
_EN2VI_CHECKPOINT = "vinai/vinai-translate-en2vi"
tokenizer_en2vi = AutoTokenizer.from_pretrained(_EN2VI_CHECKPOINT, src_lang="en_XX")
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained(_EN2VI_CHECKPOINT)
77
+
78
def translate_en2vi(en_text: str) -> str:
    """Translate English text into Vietnamese.

    The input is tokenized with ``tokenizer_en2vi`` and decoded by
    ``model_en2vi`` using beam search.

    Args:
        en_text: English source text.

    Returns:
        The Vietnamese translation, or ``""`` when ``en_text`` is empty or
        contains only whitespace.
    """
    # Nothing to translate: return early instead of running beam search on an
    # empty prompt and handing back whatever the model happens to generate.
    if not en_text or en_text.isspace():
        return ""

    input_ids = tokenizer_en2vi(en_text, return_tensors="pt").input_ids
    output_ids = model_en2vi.generate(
        input_ids,
        # Start decoding with the Vietnamese language code.
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        # Beam search decoding.
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)
    return " ".join(decoded)