TSjB committed on
Commit
37728ff
·
verified ·
1 Parent(s): 6e9d47c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -34
app.py CHANGED
@@ -1,10 +1,15 @@
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
4
  import pandas as pd
 
 
 
5
  MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V2'
 
6
 
7
- # LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
8
  LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
9
  DIALECT = pd.DataFrame({"dialect": ["дж\ч", "ж\ч", "з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
10
 
@@ -18,18 +23,34 @@ SYSTEM_LANG = "rus"
18
  # })
19
  NAMES = pd.DataFrame({
20
  "id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
21
- "krc": ["# Къарачай-Малкъар кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-малкъар, орус тиллени арасында биринчи кёчюрюўчюдю. [Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_berberov) къурагъандыла\n\nСоинвестированиени эмда спонсорлукъ болушлукъну юсюнден [Али Берберовгъа](https://t.me/ali_berberov) соругъуз"],
22
- "rus": ["# Карачаево-балкарский переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-балкарский диалект", "Перевести","Первый переводчик между карачаево-балкарским и русским языками. Разработчики: [Богдан Теунаев](https://t.me/bogdan_tewunalany), [Али Берберов](https://t.me/ali_berberov)\n\nПо вопросам соинвестирования и спонсорской поддержки обращайтесь к [Али Берберову](https://t.me/ali_berberov)"],
23
- "tur": ["# Karaçay-Malkar tercümanı", "dilden", "dile", "Buraya yaz...", "Çevrilmiş metin burada", "Karaçay-Malkar lehçesi", "Tercüme edin", "Karaçay-Balkarca ve Rusça dilleri arasındaki ilk çevirmen. Geliştiriciler: [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nOrtak yatırım ve sponsorluk ile ilgili sorularınız için [Ali Berberov](https://t.me/ali_berberov) ile iletişime geçin"],
24
- "eng": ["# Qarachay-Malqar translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator between Qarachay-Malqar and Russian languages. Developers: [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nFor co-investment and sponsorship, please contact [Ali Berberov] (https://t.me/ali_berberov)"]
25
  })
26
 
27
 
28
- # 2. Models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
30
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
31
 
32
- # 3. Fix tokenizer
33
  def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
34
  """
35
  Add a new language token to the tokenizer vocabulary
@@ -51,7 +72,7 @@ def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
51
 
52
  fixTokenizer(tokenizer)
53
 
54
- # 4. Change letters
55
 
56
  def fromModel(str, dialect = "qrc"):
57
  if dialect == "qrc":
@@ -329,7 +350,7 @@ def toModel(str):
329
  str = str.replace("НГ", " N")
330
  return str
331
 
332
- # 4. Translate function
333
  def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
334
  a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
335
  ):
@@ -350,31 +371,76 @@ def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
350
  return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
351
 
352
 
353
- # 5. Translate
354
  def translateProcess(text, from_, to, dialect):
355
-
356
- if dialect == "" or dialect is None:
357
- dialect = "дж\ч"
358
- if from_ == "" or from_ is None:
359
- from_ = "Русский язык"
360
- if to == "" or to is None:
361
- to = "Къарачай-Малкъар тил"
362
-
363
- from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
364
- to = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
365
- dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())
366
-
367
  if from_ == 'krc_Cyrl':
368
  text = toModel(text)
369
 
370
- str = translatePy(text, src_lang = from_, tgt_lang = to)
371
 
372
  if to == 'krc_Cyrl':
373
- str = fromModel(str, dialect = dialect)
374
 
375
- return str
376
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
 
378
  _title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
379
  _from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
380
  _to = "".join(NAMES[NAMES.id == "to"][SYSTEM_LANG].to_list())
@@ -388,28 +454,32 @@ with gr.Blocks() as demo:
388
  gr.Markdown(_title)
389
  with gr.Row():
390
  choice_input = gr.Dropdown(
391
- choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык", filterable = False)
 
392
 
393
  with gr.Column():
394
  with gr.Row():
395
  choice_output = gr.Dropdown(
396
- choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил", filterable = False)
397
 
398
  dialect = gr.Dropdown(
399
- choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч", filterable = False)
400
 
401
  with gr.Row():
402
  with gr.Column():
403
- text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "")
404
 
405
  with gr.Column():
406
- text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "")
407
 
408
  text_button = gr.Button(_translate, variant = 'primary')
409
 
410
- text_button.click(translateProcess, inputs=[text_input, choice_input, choice_output, dialect], outputs=[text_output]) # text, from, to, dialect
411
 
412
  gr.Markdown(_annotation)
413
 
414
- # 6. Launch
415
- demo.launch()
 
 
 
 
1
+ # 1. Libraries
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
5
  import pandas as pd
6
+ from datasets import load_dataset
7
+
8
+ # 2. Constants
9
  MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V2'
10
+ DATA_PATH = "TSjB/dictionary_krc_rus"
11
 
12
+ # LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
13
  LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
14
  DIALECT = pd.DataFrame({"dialect": ["дж\ч", "ж\ч", "з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
15
 
 
23
  # })
24
  NAMES = pd.DataFrame({
25
  "id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
26
+ "krc": ["# Къарачай-Малкъар сёзлюк бла кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-малкъар, орус тиллени арасында биринчи кёчюрюўчюдю. Сёзлюк да ичине салыннганды.\n\n[Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_berberov) къурагъандыла\n\nСоинвестированиени эмда спонсорлукъ болушлукъну юсюнден [Али Берберовгъа](https://t.me/ali_berberov) соругъуз"],
27
+ "rus": ["# Карачаево-балкарский словарь и переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-балкарский диалект", "Перевести","Первый переводчик между карачаево-балкарским и русским языками. Также встроен словарь для отдельных слов или коротких фраз.\n\nРазработчики: [Богдан Теунаев](https://t.me/bogdan_tewunalany), [Али Берберов](https://t.me/ali_berberov)\n\nПо вопросам соинвестирования и спонсорской поддержки обращайтесь к [Али Берберову](https://t.me/ali_berberov)"],
28
+ "tur": ["# Karaçayca-Balkarca sözlük ve çevirmen", "dilden", "dile", "Buraya yaz...", "Çevrilmiş metin burada", "Karaçay-Malkar lehçesi", "Tercüme edin", "Karaçay-Balkarca ve Rusça dilleri arasındaki ilk çevirmen. Tek tek kelimeler veya kısa ifadeler için bir sözlük de yerleşiktir.\n\nGeliştiriciler: [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nOrtak yatırım ve sponsorluk ile ilgili sorularınız için [Ali Berberov](https://t.me/ali_berberov) ile iletişime geçin"],
29
+ "eng": ["# Qarachay-Malqar dictionary and translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator between Qarachay-Malqar and Russian languages. A dictionary for individual words or short phrases is also built in.\n\nDevelopers: [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nFor co-investment and sponsorship, please contact [Ali Berberov] (https://t.me/ali_berberov)"]
30
  })
31
 
32
 
33
# Cap on the number of rows kept from each individual dictionary search.
OUTPUT_ROW_BY_EVERY_DICTIONARY = 15

# Directory of a locally prepared dictionary; only the disabled CSV loader
# below refers to it.
FILEPATH_SOURCE_PREPARED = "1.Data/Dictionary"
# dictionary = pd.read_csv("%s/dictionary.csv" % FILEPATH_SOURCE_PREPARED, sep = ";")

# 3. Upload
# Fetch the dataset named by DATA_PATH and turn its 'train' split into a frame.
dictionary = pd.DataFrame(load_dataset(DATA_PATH)['train'])

# Headwords are stored upper-cased; the "*_l" columns are lower-cased copies
# of the headword and of the definition used for searching.
dictionary["soz"] = dictionary["soz"].str.upper()
dictionary["soz_l"] = dictionary["soz"].str.lower()
dictionary["belgi_l"] = dictionary["belgi"].str.lower()

# Per-language views selected by the "til" column.
dictionary_qm = dictionary.loc[dictionary["til"] == "krc"]
dictionary_ru = dictionary.loc[dictionary["til"] == "rus"]
48
+
49
+
50
  tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
51
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
52
 
53
+ # 4. Fix tokenizer
54
  def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
55
  """
56
  Add a new language token to the tokenizer vocabulary
 
72
 
73
  fixTokenizer(tokenizer)
74
 
75
+ # 5. Change letters
76
 
77
  def fromModel(str, dialect = "qrc"):
78
  if dialect == "qrc":
 
350
  str = str.replace("НГ", " N")
351
  return str
352
 
353
+ # 6. Translate function
354
  def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
355
  a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
356
  ):
 
371
  return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
372
 
373
 
 
374
  def translateProcess(text, from_, to, dialect):
375
+ # print(from_)
376
+ # print(to)
377
+ # print(dialect)
 
 
 
 
 
 
 
 
 
378
  if from_ == 'krc_Cyrl':
379
  text = toModel(text)
380
 
381
+ str_ = translatePy(text, src_lang = from_, tgt_lang = to)
382
 
383
  if to == 'krc_Cyrl':
384
+ str_ = fromModel(str_, dialect = dialect)
385
 
386
+ return str_
387
+
388
+ # 7. Dictionary function
389
def dictionaryDisp(from_, text):
    """Look ``text`` up in the dictionary and format the matching entries.

    Three searches are run, each capped at OUTPUT_ROW_BY_EVERY_DICTIONARY rows:
    headwords of the source-language dictionary that start with the text,
    definitions of the source-language dictionary that contain it, and
    definitions of the other dictionary that contain it. A definition matches
    when the text occurs at its start or right after non-word characters.

    Args:
        from_: Source language token, 'krc_Cyrl' or 'rus_Cyrl'.
        text: Word or short phrase typed by the user.

    Returns:
        The matching entries joined into one string, or "" when nothing
        matches, the text is blank, or ``from_`` is not a known token.
    """
    import re  # local import: the block has to stay self-contained

    needle = (text or "").lower()
    # A blank needle would prefix-match every headword and return unrelated rows.
    if not needle.strip():
        return ""

    # Filtering never mutates the frames, so no per-call copies are needed.
    if from_ == 'krc_Cyrl':
        df_from_to, df_to_from = dictionary_qm, dictionary_ru
    elif from_ == 'rus_Cyrl':
        df_from_to, df_to_from = dictionary_ru, dictionary_qm
    else:
        # Unknown token: there is no dictionary to search.
        return ""

    # The user text is literal, not a pattern: escape it so that input such as
    # "c++", "(" or "?" neither raises re.error nor acts as a regex operator.
    pattern = r"(?:\W+|^)" + re.escape(needle)
    limit = OUTPUT_ROW_BY_EVERY_DICTIONARY

    # na=False keeps the masks boolean even if a cell is missing.
    by_headword = df_from_to[
        df_from_to.soz_l.str.startswith(needle, na=False)
    ].iloc[:limit]
    by_definition = df_from_to[
        df_from_to.belgi_l.str.contains(pattern, regex=True, na=False)
    ].iloc[:limit]
    by_reverse = df_to_from[
        df_to_from.belgi_l.str.contains(pattern, regex=True, na=False)
    ].iloc[:limit]

    # The same row can be found by more than one search; keep it once.
    found = pd.concat(
        [by_headword, by_definition, by_reverse], ignore_index=True
    ).drop_duplicates()[["soz", "belgi"]]

    return "".join(
        f"{row.soz} ----- {row.belgi}\n\n----------\n\n"
        for row in found.itertuples()
    )
419
+ # len(sozluk)
420
+
421
+
422
+ # 8. Output function
423
def out(text, from_, to, dialect):
    """Answer a request with dictionary entries or, failing that, a translation.

    Args:
        text: Text typed by the user.
        from_: Display name of the source language (a LANGUAGE.language value).
        to: Display name of the target language (a LANGUAGE.language value).
        dialect: Display name of the dialect (a DIALECT.dialect value).

    Returns:
        "" for blank input; otherwise the string from ``dictionaryDisp`` when
        it is non-empty, else the result of ``translateProcess``.
    """
    # Blank input has nothing to look up or translate. Previously it reached
    # the dictionary search, where an empty prefix matches every headword.
    if not text or not text.strip():
        return ""

    # Missing selections fall back to the defaults shown in the UI. The
    # backslash is written as "\\" because "\ч" is not a valid escape sequence
    # (SyntaxWarning on Python 3.12+); the resulting string is unchanged.
    if dialect is None or dialect == "":
        dialect = "дж\\ч"
    if from_ is None or from_ == "":
        from_ = "Русский язык"
    if to is None or to == "":
        to = "Къарачай-Малкъар тил"

    # Map display names to model language tokens and the dialect short name;
    # a name that is not in the table yields "".
    src_token = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
    tgt_token = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
    dialect_code = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())

    # The dictionary is tried first; the model only runs when it finds nothing.
    found = dictionaryDisp(src_token, text)
    if found:
        return found

    return translateProcess(text, src_token, tgt_token, dialect_code)
442
 
443
+ # 9. UI definition
444
  _title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
445
  _from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
446
  _to = "".join(NAMES[NAMES.id == "to"][SYSTEM_LANG].to_list())
 
454
  gr.Markdown(_title)
455
  with gr.Row():
456
  choice_input = gr.Dropdown(
457
+ # choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык", filterable = False)
458
+ choices = LANGUAGE.language.to_list(), label=_from, value = "Русский язык")
459
 
460
  with gr.Column():
461
  with gr.Row():
462
  choice_output = gr.Dropdown(
463
+ choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил")
464
 
465
  dialect = gr.Dropdown(
466
+ choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
467
 
468
  with gr.Row():
469
  with gr.Column():
470
+ text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "", show_copy_button=True)
471
 
472
  with gr.Column():
473
+ text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "", autoscroll=False, show_copy_button=True)
474
 
475
  text_button = gr.Button(_translate, variant = 'primary')
476
 
477
+ text_button.click(out, inputs=[text_input, choice_input, choice_output, dialect], outputs=[text_output]) # text, from, to, dialect
478
 
479
  gr.Markdown(_annotation)
480
 
481
+ # 10. Launch
482
+ demo.launch()
483
+
484
+
485
+