Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,24 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
4 |
-
|
5 |
-
# model_qm_ru_path = 'TSjB/mbart-large-52-qm-ru-v1'
|
6 |
MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V1'
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# 2. Models
|
9 |
-
#tokenizer_ru_qm = MBart50Tokenizer.from_pretrained(model_ru_qm_path)
|
10 |
-
#tokenizer_qm_ru = MBart50Tokenizer.from_pretrained(model_qm_ru_path)
|
11 |
-
#model_ru_qm = MBartForConditionalGeneration.from_pretrained(model_ru_qm_path)
|
12 |
-
#model_qm_ru = MBartForConditionalGeneration.from_pretrained(model_qm_ru_path)
|
13 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
|
14 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
|
15 |
|
@@ -198,7 +207,6 @@ def fromModel(str, dialect = "qrc"):
|
|
198 |
str = str.replace("къ|гъ", "х")
|
199 |
return str
|
200 |
|
201 |
-
|
202 |
def toModel(str):
|
203 |
str = str.replace("дж", "j")
|
204 |
str = str.replace("Дж", "J")
|
@@ -312,41 +320,8 @@ def toModel(str):
|
|
312 |
str = str.replace("Нг", " N")
|
313 |
str = str.replace("НГ", " N")
|
314 |
return str
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
|
319 |
# 4. Translate function
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
#def translatePy(text, model, tokenizer, src='ru_RU', trg='qm_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False, n_out=None, **kwargs):
|
324 |
-
# tokenizer.src_lang = src
|
325 |
-
# tokenizer.tgt_lang = trg
|
326 |
-
# encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
|
327 |
-
# if max_length == 'auto':
|
328 |
-
# max_length = int(32 + 1.5 * encoded.input_ids.shape[1])
|
329 |
-
# if train_mode:
|
330 |
-
# model.train()
|
331 |
-
# else:
|
332 |
-
# model.eval()
|
333 |
-
# generated_tokens = model.generate(
|
334 |
-
# **encoded.to(model.device),
|
335 |
-
# forced_bos_token_id=tokenizer.lang_code_to_id[trg],
|
336 |
-
# max_length=max_length,
|
337 |
-
# num_beams=num_beams,
|
338 |
-
# repetition_penalty=repetition_penalty,
|
339 |
-
# # early_stopping=True,
|
340 |
-
# num_return_sequences=n_out or 1,
|
341 |
-
# **kwargs
|
342 |
-
# )
|
343 |
-
# out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
344 |
-
# if isinstance(text, str) and n_out is None:
|
345 |
-
# return out[0]
|
346 |
-
# return out
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
351 |
a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
|
352 |
):
|
@@ -366,27 +341,60 @@ def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
|
366 |
)
|
367 |
return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
|
368 |
|
369 |
-
# 5. Translate
|
370 |
-
def transl(text, til, change_letters = True):
|
371 |
-
str = ''
|
372 |
-
if til == "Къарачай-Малкъар":
|
373 |
-
if change_letters == True:
|
374 |
-
str = translatePy(toModel(text), src_lang = 'krc_Cyrl', tgt_lang='rus_Cyrl')
|
375 |
-
else:
|
376 |
-
str = translatePy(text, src_lang = 'krc_Cyrl', tgt_lang='rus_Cyrl')
|
377 |
-
elif til == "Русский":
|
378 |
-
if change_letters == True:
|
379 |
-
str = translatePy(text, src_lang = 'rus_Cyrl', tgt_lang='krc_Cyrl')
|
380 |
-
str = fromModel(str)
|
381 |
-
else:
|
382 |
-
str = translatePy(text, src_lang = 'rus_Cyrl', tgt_lang='krc_Cyrl')
|
383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
return str
|
385 |
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
4 |
+
import pandas as pd
|
|
|
5 |
MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V1'
|
6 |
|
7 |
+
# LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
|
8 |
+
LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский"], "token": ["krc_Cyrl", "rus_Cyrl"]})
|
9 |
+
# Dialect display names (shown in the UI dropdown) mapped to the short codes
# passed to fromModel(). The backslash is escaped explicitly: "\ч" and "\ц" are
# not valid escape sequences, so the unescaped form triggers a SyntaxWarning on
# Python 3.12+ (and is slated to become an error). The runtime strings are
# unchanged: each display name still contains a single literal backslash.
DIALECT = pd.DataFrame({"dialect": ["дж\\ч", "ж\\ч", "з\\ц"], "short_name": ["qrc", "hlm", "mqr"]})
|
10 |
+
|
11 |
+
SYSTEM_LANG = "rus"
|
12 |
+
NAMES = pd.DataFrame({
|
13 |
+
"id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
|
14 |
+
"krc": ["# Къарачай-Малкъар кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-Малкъар тилде биринчи кёчюрюўчюдю. [Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_bulat1990) къурагъандыла\n\nМодель Орус бла Къарачай-Малкъар тилледе юйрене тургъаны себебли, Къарачай-Малкъар кёчюрюў башха тиллеге да осал болургъа боллукъду."],
|
15 |
+
"rus": ["# Карачаево-Балкарский переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-Балкарский диалект", "Перевести","Первый переводчик на карачаево-балкарский язык. Создан [Богданом Теунаевым](https://t.me/bogdan_tewunalany), [Али Берберовым](https://t.me/ali_bulat1990)\n\nТак как модель обучалась на парах Русский и Карачаево-Балкарский, то Карачаево-Балкарский перевод для остальных языков может быть хуже."],
|
16 |
+
"tur": ["# Karaçay-Malkar tercümanı", "dilden", "dile", "Buraya yaz...", "Çevrilmiş metin burada", "Karaçay-Malkar lehçesi", "Tercüme edin", "İlk çevirmen. [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_bulat1990) tarafından oluşturuldu\n\nModel Rusça ve Karaçay-Malkar çiftleri halinde eğitildiğinden, diğer diller için Karaçay-Malkar çevirisi daha kötü olabilir."],
|
17 |
+
"eng": ["# Qarachay-Malqar translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator. Created by [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_bulat1990)\n\nSince the model was trained in pairs of Russian and Qarachay-Malqar, the Qarachay-Malqar translation for other languages may be worse."]
|
18 |
+
})
|
19 |
+
|
20 |
+
|
21 |
# 2. Models
|
|
|
|
|
|
|
|
|
22 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
|
23 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
|
24 |
|
|
|
207 |
str = str.replace("къ|гъ", "х")
|
208 |
return str
|
209 |
|
|
|
210 |
def toModel(str):
|
211 |
str = str.replace("дж", "j")
|
212 |
str = str.replace("Дж", "J")
|
|
|
320 |
str = str.replace("Нг", " N")
|
321 |
str = str.replace("НГ", " N")
|
322 |
return str
|
|
|
|
|
|
|
323 |
|
324 |
# 4. Translate function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
326 |
a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
|
327 |
):
|
|
|
341 |
)
|
342 |
return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
|
343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
+
# 5. Translate
|
346 |
+
def translateProcess(text, from_, to, dialect):
    """Translate ``text`` between the languages selected in the UI.

    Args:
        text: Source text entered by the user.
        from_: Display name of the source language, as listed in
            ``LANGUAGE.language``.
        to: Display name of the target language, as listed in
            ``LANGUAGE.language``.
        dialect: Display name of the dialect, as listed in
            ``DIALECT.dialect``.

    Returns:
        The translated text, as returned by ``translatePy`` and, when the
        target is ``krc_Cyrl``, post-processed by ``fromModel``.
    """
    # Resolve UI display names to model tokens / dialect codes. A name that
    # is not present in the lookup table resolves to the empty string.
    src_token = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
    tgt_token = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
    dialect_code = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())

    # Karachay-Balkar input goes through toModel()'s letter substitutions
    # before it is handed to the model.
    if src_token == 'krc_Cyrl':
        text = toModel(text)

    translated = translatePy(text, src_lang=src_token, tgt_lang=tgt_token)

    # The original compared an undefined name (`to1`), which raised NameError
    # on every call; the intended operand is the resolved target token.
    if tgt_token == 'krc_Cyrl':
        translated = fromModel(translated, dialect=dialect_code)

    return translated
|
360 |
|
361 |
+
|
362 |
+
def _ui_text(key):
    """Return the UI string stored under ``key`` in NAMES for SYSTEM_LANG.

    Yields the empty string when ``key`` matches no row of ``NAMES.id``.
    """
    matching_rows = NAMES[NAMES.id == key]
    return "".join(matching_rows[SYSTEM_LANG].to_list())


# Localised labels used by the Gradio layout below.
_title = _ui_text("title")
_from = _ui_text("from")
_to = _ui_text("to")
_your_sent = _ui_text("your_sent")
_transl_sent = _ui_text("transl_sent")
_dialect = _ui_text("dialect")
_translate = _ui_text("translate")
_annotation = _ui_text("annotation")
|
370 |
+
|
371 |
+
with gr.Blocks() as demo:
|
372 |
+
gr.Markdown(_title)
|
373 |
+
with gr.Row():
|
374 |
+
choice_input = gr.Dropdown(
|
375 |
+
choices = LANGUAGE.language.to_list(), label=_from, value = "Русский")
|
376 |
+
|
377 |
+
with gr.Column():
|
378 |
+
with gr.Row():
|
379 |
+
with gr.Column():
|
380 |
+
choice_output = gr.Dropdown(
|
381 |
+
choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил")
|
382 |
+
with gr.Column():
|
383 |
+
dialect = gr.Dropdown(
|
384 |
+
choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
|
385 |
+
|
386 |
+
with gr.Row():
|
387 |
+
with gr.Column():
|
388 |
+
text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "")
|
389 |
+
|
390 |
+
with gr.Column():
|
391 |
+
text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "")
|
392 |
+
|
393 |
+
text_button = gr.Button(_translate, variant = 'primary')
|
394 |
+
|
395 |
+
text_button.click(translateProcess, inputs=[text_input, choice_input, choice_output, dialect], outputs=[text_output]) # text, from, to, dialect
|
396 |
+
|
397 |
+
gr.Markdown(_annotation)
|
398 |
+
|
399 |
+
# 6. Launch
|
400 |
demo.launch()
|