alimboff committed on
Commit 13b4ac4 · verified · 1 parent: 463c82f

Upload 2 files

Files changed (2)
  1. app.py +85 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
+ import gradio as gr
+
+ model = AutoModelForSeq2SeqLM.from_pretrained('alimboff/nllb-200-kbd').cuda()
+ tokenizer = NllbTokenizer.from_pretrained('alimboff/nllb-200-kbd')
+
+ def fix_tokenizer(tokenizer, new_lang='kbd_Cyrl'):
+     """
+     Add a new language token to the tokenizer vocabulary
+     (this should be done each time after its initialization).
+     Note: this patches NllbTokenizer internals (lang_code_to_id,
+     fairseq_tokens_to_ids) that exist only in older transformers releases.
+     """
+     old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
+     tokenizer.lang_code_to_id[new_lang] = old_len - 1
+     tokenizer.id_to_lang_code[old_len - 1] = new_lang
+     # always move "mask" to the last position
+     tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
+
+     tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
+     tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
+     if new_lang not in tokenizer._additional_special_tokens:
+         tokenizer._additional_special_tokens.append(new_lang)
+     # clear the added token encoder; otherwise a new token may end up there by mistake
+     tokenizer.added_tokens_encoder = {}
+     tokenizer.added_tokens_decoder = {}
+
+ fix_tokenizer(tokenizer)
+
+ language_codes = {
+     "Кабардино-Черкесский": "kbd_Cyrl",
+     "Русский": "rus_Cyrl"
+ }
+
+ def translate(
+     text, input_language, output_language,
+     a=32, b=3, max_input_length=1024, num_beams=8, **kwargs
+ ):
+     """Turn a text or a list of texts into a list of translations"""
+     src_lang = language_codes[input_language]
+     tgt_lang = language_codes[output_language]
+     tokenizer.src_lang = src_lang
+     tokenizer.tgt_lang = tgt_lang
+     inputs = tokenizer(
+         text, return_tensors='pt', padding=True, truncation=True,
+         max_length=max_input_length
+     )
+     model.eval()  # turn off training mode
+     result = model.generate(
+         **inputs.to(model.device),
+         forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
+         max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
+         num_beams=num_beams, **kwargs
+     )
+     return tokenizer.batch_decode(result, skip_special_tokens=True)[0]  # drop [0] to return the full list
+
+ with gr.Blocks() as demo:
+     gr.Markdown("### Переводчик через ИИ")
+
+     with gr.Row():
+         input_language = gr.Radio(choices=["Кабардино-Черкесский", "Русский"], label="Выберите язык исходного текста", value="Кабардино-Черкесский")
+         output_language = gr.Radio(choices=["Кабардино-Черкесский", "Русский"], label="Выберите язык для перевода", value="Русский")
+
+     with gr.Row():
+         text_input = gr.Textbox(label="Введите текст для перевода")
+         text_output = gr.Textbox(label="Перевод", interactive=False)
+
+     with gr.Row():
+         translate_button = gr.Button("Перевести")
+
+     translate_button.click(
+         fn=translate,
+         inputs=[text_input, input_language, output_language],
+         outputs=text_output
+     )
+
+ demo.launch()
+
+ # Example usage (translate expects the language names used as keys of language_codes):
+ # t = 'пэшым лӀы зыбжанэ щӀэсщ'
+
+ # kbdru = translate(t, 'Кабардино-Черкесский', 'Русский')
+ # rukbd = translate(kbdru, 'Русский', 'Кабардино-Черкесский')
+
+ # print(kbdru)
+ # print(rukbd)
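
Note that app.py moves the model to CUDA unconditionally, so the app will crash on CPU-only hardware. Below is a minimal device-aware sketch of the loading step, assuming the same `alimboff/nllb-200-kbd` checkpoint; it is not part of this commit.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

MODEL_ID = 'alimboff/nllb-200-kbd'

# Use the GPU when one is present, otherwise fall back to CPU so the
# Space still starts on CPU-only hardware (slower, but functional).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).to(device)
tokenizer = NllbTokenizer.from_pretrained(MODEL_ID)
```

The rest of the script needs no changes with this variant, since `translate` already moves its inputs with `inputs.to(model.device)`.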
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ transformers
+ gradio
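
As written, requirements.txt omits two packages the app needs at runtime: `torch`, which transformers requires to load the model, and `sentencepiece`, which `NllbTokenizer` depends on. The `fix_tokenizer` helper also patches tokenizer internals (`lang_code_to_id`, `fairseq_tokens_to_ids`) that were removed in later refactors of the NLLB tokenizer, so pinning transformers is safer. A more explicit file might look like the sketch below; the `<4.34` bound is an assumption and should be verified against the transformers version the model was trained with.

```
transformers<4.34   # fix_tokenizer relies on pre-refactor NllbTokenizer internals (bound is an assumption)
torch
sentencepiece
gradio
```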