Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,15 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
4 |
import pandas as pd
|
|
|
|
|
|
|
5 |
MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V2'
|
|
|
6 |
|
7 |
-
# LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
|
8 |
LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
|
9 |
DIALECT = pd.DataFrame({"dialect": ["дж\ч", "ж\ч", "з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
|
10 |
|
@@ -18,18 +23,34 @@ SYSTEM_LANG = "rus"
|
|
18 |
# })
|
19 |
NAMES = pd.DataFrame({
|
20 |
"id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
|
21 |
-
"krc": ["# Къарачай-Малкъар кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-малкъар, орус тиллени арасында биринчи кёчюрюўчюдю. [Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_berberov) къурагъандыла\n\nСоинвестированиени эмда спонсорлукъ болушлукъну юсюнден [Али Берберовгъа](https://t.me/ali_berberov) соругъуз"],
|
22 |
-
"rus": ["# Карачаево-балкарский переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-балкарский диалект", "Перевести","Первый переводчик между карачаево-балкарским и русским языками. Разработчики: [Богдан Теунаев](https://t.me/bogdan_tewunalany), [Али Берберов](https://t.me/ali_berberov)\n\nПо вопросам соинвестирования и спонсорской поддержки обращайтесь к [Али Берберову](https://t.me/ali_berberov)"],
|
23 |
-
"tur": ["#
|
24 |
-
"eng": ["# Qarachay-Malqar translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator between Qarachay-Malqar and Russian languages.
|
25 |
})
|
26 |
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
|
30 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
|
31 |
|
32 |
-
#
|
33 |
def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
|
34 |
"""
|
35 |
Add a new language token to the tokenizer vocabulary
|
@@ -51,7 +72,7 @@ def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
|
|
51 |
|
52 |
fixTokenizer(tokenizer)
|
53 |
|
54 |
-
#
|
55 |
|
56 |
def fromModel(str, dialect = "qrc"):
|
57 |
if dialect == "qrc":
|
@@ -329,7 +350,7 @@ def toModel(str):
|
|
329 |
str = str.replace("НГ", " N")
|
330 |
return str
|
331 |
|
332 |
-
#
|
333 |
def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
334 |
a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
|
335 |
):
|
@@ -350,31 +371,76 @@ def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
|
350 |
return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
|
351 |
|
352 |
|
353 |
-
# 5. Translate
|
354 |
def translateProcess(text, from_, to, dialect):
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
if from_ == "" or from_ is None:
|
359 |
-
from_ = "Русский язык"
|
360 |
-
if to == "" or to is None:
|
361 |
-
to = "Къарачай-Малкъар тил"
|
362 |
-
|
363 |
-
from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
|
364 |
-
to = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
|
365 |
-
dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())
|
366 |
-
|
367 |
if from_ == 'krc_Cyrl':
|
368 |
text = toModel(text)
|
369 |
|
370 |
-
|
371 |
|
372 |
if to == 'krc_Cyrl':
|
373 |
-
|
374 |
|
375 |
-
return
|
376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
|
|
|
378 |
_title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
|
379 |
_from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
|
380 |
_to = "".join(NAMES[NAMES.id == "to"][SYSTEM_LANG].to_list())
|
@@ -388,28 +454,32 @@ with gr.Blocks() as demo:
|
|
388 |
gr.Markdown(_title)
|
389 |
with gr.Row():
|
390 |
choice_input = gr.Dropdown(
|
391 |
-
choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык", filterable = False)
|
|
|
392 |
|
393 |
with gr.Column():
|
394 |
with gr.Row():
|
395 |
choice_output = gr.Dropdown(
|
396 |
-
choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил"
|
397 |
|
398 |
dialect = gr.Dropdown(
|
399 |
-
choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч"
|
400 |
|
401 |
with gr.Row():
|
402 |
with gr.Column():
|
403 |
-
text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "")
|
404 |
|
405 |
with gr.Column():
|
406 |
-
text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "")
|
407 |
|
408 |
text_button = gr.Button(_translate, variant = 'primary')
|
409 |
|
410 |
-
text_button.click(
|
411 |
|
412 |
gr.Markdown(_annotation)
|
413 |
|
414 |
-
#
|
415 |
-
demo.launch()
|
|
|
|
|
|
|
|
1 |
+
# 1. Libraries
|
2 |
import gradio as gr
|
3 |
import torch
|
4 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
5 |
import pandas as pd
|
6 |
+
from datasets import load_dataset
|
7 |
+
|
8 |
+
# 2. Constants
|
9 |
MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V2'
|
10 |
+
DATA_PATH = "TSjB/dictionary_krc_rus"
|
11 |
|
12 |
+
# LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
|
13 |
# Display names of the supported languages (as shown in the dropdowns) and the
# language tokens that are passed to the translation function for each of them.
LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
# Dialect labels shown in the UI and the short codes handed to fromModel().
# The backslash is escaped explicitly: "\ч" / "\ц" are not valid escape
# sequences, so the unescaped form only works by accident and triggers a
# SyntaxWarning on recent Python versions. The runtime values are unchanged.
DIALECT = pd.DataFrame({"dialect": ["дж\\ч", "ж\\ч", "з\\ц"], "short_name": ["qrc", "hlm", "mqr"]})
|
15 |
|
|
|
23 |
# })
|
24 |
# UI strings, one column per interface language and one row per widget id.
# The column to use is selected elsewhere via SYSTEM_LANG.
# Fix: the last link of the English annotation had a space between "]" and "(",
# which prevents Markdown from rendering it as a link.
NAMES = pd.DataFrame({
    "id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
    "krc": ["# Къарачай-Малкъар сёзлюк бла кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-малкъар, орус тиллени арасында биринчи кёчюрюўчюдю. Сёзлюк да ичине салыннганды.\n\n[Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_berberov) къурагъандыла\n\nСоинвестированиени эмда спонсорлукъ болушлукъну юсюнден [Али Берберовгъа](https://t.me/ali_berberov) соругъуз"],
    "rus": ["# Карачаево-балкарский словарь и переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-балкарский диалект", "Перевести","Первый переводчик между карачаево-балкарским и русским языками. Также встроен словарь для отдельных слов или коротких фраз.\n\nРазработчики: [Богдан Теунаев](https://t.me/bogdan_tewunalany), [Али Берберов](https://t.me/ali_berberov)\n\nПо вопросам соинвестирования и спонсорской поддержки обращайтесь к [Али Берберову](https://t.me/ali_berberov)"],
    "tur": ["# Karaçayca-Balkarca sözlük ve çevirmen", "dilden", "dile", "Buraya yaz...", "Çevrilmiş metin burada", "Karaçay-Malkar lehçesi", "Tercüme edin", "Karaçay-Balkarca ve Rusça dilleri arasındaki ilk çevirmen. Tek tek kelimeler veya kısa ifadeler için bir sözlük de yerleşiktir.\n\nGeliştiriciler: [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nOrtak yatırım ve sponsorluk ile ilgili sorularınız için [Ali Berberov](https://t.me/ali_berberov) ile iletişime geçin"],
    "eng": ["# Qarachay-Malqar dictionary and translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator between Qarachay-Malqar and Russian languages. A dictionary for individual words or short phrases is also built in.\n\nDevelopers: [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nFor co-investment and sponsorship, please contact [Ali Berberov](https://t.me/ali_berberov)"]
})
|
31 |
|
32 |
|
33 |
+
OUTPUT_ROW_BY_EVERY_DICTIONARY = 15
|
34 |
+
|
35 |
+
FILEPATH_SOURCE_PREPARED = "1.Data/Dictionary"
|
36 |
+
# dictionary = pd.read_csv("%s/dictionary.csv" % FILEPATH_SOURCE_PREPARED, sep = ";")
|
37 |
+
|
38 |
+
# 3. Upload
|
39 |
+
# Fetch the dictionary dataset and keep its "train" split as a DataFrame.
dictionary = pd.DataFrame(load_dataset(DATA_PATH)['train'])

# "soz" is stored upper-cased (it is what gets displayed); lower-cased copies
# of "soz" and "belgi" are kept so lookups can compare against a lower-cased
# query.
dictionary["soz"] = dictionary["soz"].str.upper()
dictionary["soz_l"] = dictionary["soz"].str.lower()
dictionary["belgi_l"] = dictionary["belgi"].str.lower()

# Split the rows by the language code held in the "til" column.
dictionary_qm = dictionary[dictionary["til"] == "krc"]
dictionary_ru = dictionary[dictionary["til"] == "rus"]
|
48 |
+
|
49 |
+
|
50 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
|
51 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
|
52 |
|
53 |
+
# 4. Fix tokenizer
|
54 |
def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
|
55 |
"""
|
56 |
Add a new language token to the tokenizer vocabulary
|
|
|
72 |
|
73 |
fixTokenizer(tokenizer)
|
74 |
|
75 |
+
# 5. Change letters
|
76 |
|
77 |
def fromModel(str, dialect = "qrc"):
|
78 |
if dialect == "qrc":
|
|
|
350 |
str = str.replace("НГ", " N")
|
351 |
return str
|
352 |
|
353 |
+
# 6. Translate function
|
354 |
def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
|
355 |
a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
|
356 |
):
|
|
|
371 |
return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
|
372 |
|
373 |
|
|
|
374 |
def translateProcess(text, from_, to, dialect):
    """Translate ``text`` with the model, adapting letters on the krc side.

    Args:
        text: Text to translate.
        from_: Source language token (e.g. ``'rus_Cyrl'`` or ``'krc_Cyrl'``).
        to: Target language token.
        dialect: Dialect short code forwarded to ``fromModel``; only used
            when the target language is ``'krc_Cyrl'``.

    Returns:
        The translated string.
    """
    # krc input goes through toModel() before it is handed to the model.
    source_text = toModel(text) if from_ == 'krc_Cyrl' else text

    translated = translatePy(source_text, src_lang=from_, tgt_lang=to)

    if to != 'krc_Cyrl':
        return translated

    # krc output is post-processed by fromModel() for the requested dialect.
    return fromModel(translated, dialect=dialect)
|
387 |
+
|
388 |
+
# 7. Dictionary function
|
389 |
+
def dictionaryDisp(from_, text):
    """Look ``text`` up in the bilingual dictionary and format the matches.

    Three searches are run, each capped at OUTPUT_ROW_BY_EVERY_DICTIONARY rows:

    1. source-language rows whose lower-cased ``soz`` starts with the query;
    2. source-language rows whose lower-cased ``belgi`` contains the query at
       the very start or right after non-word characters;
    3. the same ``belgi`` search on the other language's rows.

    Args:
        from_: Source language token, ``'krc_Cyrl'`` or ``'rus_Cyrl'``.
        text: The user's query; it is matched case-insensitively.

    Returns:
        One string with an ``"<soz> ----- <belgi>"`` entry per match, each
        followed by a ``----------`` separator, or ``""`` when the query is
        blank, the language token is unknown, or nothing matches.
    """
    # Local import: the module does not import ``re`` at file level.
    import re

    str_l = (text or "").lower()
    # A blank query would otherwise match every row of the dictionary.
    if not str_l.strip():
        return ""

    if from_ == 'krc_Cyrl':
        df_from_to, df_to_from = dictionary_qm, dictionary_ru
    elif from_ == 'rus_Cyrl':
        df_from_to, df_to_from = dictionary_ru, dictionary_qm
    else:
        # Unknown token: there is no dictionary to search (previously this
        # fell through to an empty DataFrame and raised AttributeError).
        return ""

    # The query is user input, so it must be escaped before being spliced into
    # the pattern; otherwise characters such as "(" or "*" raise re.error or
    # change the meaning of the search.
    escaped = re.escape(str_l)
    filter_ = r"\W+" + escaped + r"|^" + escaped

    limit = OUTPUT_ROW_BY_EVERY_DICTIONARY

    # na=False keeps the masks strictly boolean even if a cell is missing.
    sozluk_1 = df_from_to[df_from_to.soz_l.str.startswith(str_l, na=False)].iloc[:limit]
    sozluk_2 = df_from_to[df_from_to.belgi_l.str.contains(filter_, regex=True, na=False)].iloc[:limit]
    sozluk_3 = df_to_from[df_to_from.belgi_l.str.contains(filter_, regex=True, na=False)].iloc[:limit]

    # De-duplicate on the two displayed columns so the same entry is never
    # printed twice.
    sozluk = pd.concat([sozluk_1, sozluk_2, sozluk_3], ignore_index=True)[["soz", "belgi"]].drop_duplicates()

    return "".join(
        f"{soz} ----- {belgi}\n\n----------\n\n"
        for soz, belgi in zip(sozluk["soz"], sozluk["belgi"])
    )
|
419 |
+
# len(sozluk)
|
420 |
+
|
421 |
+
|
422 |
+
# 8. Output function
|
423 |
+
def out(text, from_, to, dialect):
    """Produce the text for the output box: dictionary hits, else a translation.

    The dropdown values are display names; they are mapped to language tokens
    and a dialect short code via the LANGUAGE and DIALECT tables. The
    dictionary is consulted first, and the translation model is only used
    when the dictionary returns nothing.

    Args:
        text: The user's input text.
        from_: Display name of the source language; empty/None falls back to
            "Русский язык".
        to: Display name of the target language; empty/None falls back to
            "Къарачай-Малкъар тил".
        dialect: Display name of the dialect; empty/None falls back to
            the first dialect label.

    Returns:
        The formatted dictionary entries or the translated text; ``""`` when
        ``text`` is empty or whitespace only.
    """
    # Nothing to look up or translate; without this guard a blank query made
    # the dictionary return arbitrary entries (or crash on None).
    if text is None or not text.strip():
        return ""

    # "\\ч" is an explicitly escaped backslash: the unescaped form is an
    # invalid escape sequence. The runtime value is unchanged.
    dialect = dialect or "дж\\ч"
    from_ = from_ or "Русский язык"
    to = to or "Къарачай-Малкъар тил"

    # Map display names to tokens / short codes ("" when there is no match).
    from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
    to = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
    dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())

    str_ = dictionaryDisp(from_, text)

    # Fall back to the model only when the dictionary has no entry.
    if not str_:
        str_ = translateProcess(text, from_, to, dialect)

    return str_
|
442 |
|
443 |
+
# 9. Definition ui
|
444 |
_title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
|
445 |
_from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
|
446 |
_to = "".join(NAMES[NAMES.id == "to"][SYSTEM_LANG].to_list())
|
|
|
454 |
gr.Markdown(_title)
|
455 |
with gr.Row():
|
456 |
choice_input = gr.Dropdown(
|
457 |
+
# choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык", filterable = False)
|
458 |
+
choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык")
|
459 |
|
460 |
with gr.Column():
|
461 |
with gr.Row():
|
462 |
choice_output = gr.Dropdown(
|
463 |
+
choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил")
|
464 |
|
465 |
dialect = gr.Dropdown(
|
466 |
+
choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
|
467 |
|
468 |
with gr.Row():
|
469 |
with gr.Column():
|
470 |
+
text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "", show_copy_button=True)
|
471 |
|
472 |
with gr.Column():
|
473 |
+
text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "", autoscroll=False, show_copy_button=True)
|
474 |
|
475 |
text_button = gr.Button(_translate, variant = 'primary')
|
476 |
|
477 |
+
text_button.click(out, inputs=[text_input, choice_input, choice_output, dialect], outputs=[text_output]) # text, from, to, dialect
|
478 |
|
479 |
gr.Markdown(_annotation)
|
480 |
|
481 |
+
# 10. Launch
|
482 |
+
demo.launch()
|
483 |
+
|
484 |
+
|
485 |
+
|