TSjB committed on
Commit
2224fa7
·
verified ·
1 Parent(s): ce51208

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -60
app.py CHANGED
@@ -1,15 +1,24 @@
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
4
- # model_ru_qm_path = 'TSjB/mbart-large-52-ru-qm-v1'
5
- # model_qm_ru_path = 'TSjB/mbart-large-52-qm-ru-v1'
6
  MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V1'
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # 2. Models
9
- #tokenizer_ru_qm = MBart50Tokenizer.from_pretrained(model_ru_qm_path)
10
- #tokenizer_qm_ru = MBart50Tokenizer.from_pretrained(model_qm_ru_path)
11
- #model_ru_qm = MBartForConditionalGeneration.from_pretrained(model_ru_qm_path)
12
- #model_qm_ru = MBartForConditionalGeneration.from_pretrained(model_qm_ru_path)
13
  tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
14
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
15
 
@@ -198,7 +207,6 @@ def fromModel(str, dialect = "qrc"):
198
  str = str.replace("къ|гъ", "х")
199
  return str
200
 
201
-
202
  def toModel(str):
203
  str = str.replace("дж", "j")
204
  str = str.replace("Дж", "J")
@@ -312,41 +320,8 @@ def toModel(str):
312
  str = str.replace("Нг", " N")
313
  str = str.replace("НГ", " N")
314
  return str
315
-
316
-
317
-
318
 
319
  # 4. Translate function
320
-
321
-
322
-
323
- #def translatePy(text, model, tokenizer, src='ru_RU', trg='qm_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False, n_out=None, **kwargs):
324
- # tokenizer.src_lang = src
325
- # tokenizer.tgt_lang = trg
326
- # encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
327
- # if max_length == 'auto':
328
- # max_length = int(32 + 1.5 * encoded.input_ids.shape[1])
329
- # if train_mode:
330
- # model.train()
331
- # else:
332
- # model.eval()
333
- # generated_tokens = model.generate(
334
- # **encoded.to(model.device),
335
- # forced_bos_token_id=tokenizer.lang_code_to_id[trg],
336
- # max_length=max_length,
337
- # num_beams=num_beams,
338
- # repetition_penalty=repetition_penalty,
339
- # # early_stopping=True,
340
- # num_return_sequences=n_out or 1,
341
- # **kwargs
342
- # )
343
- # out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
344
- # if isinstance(text, str) and n_out is None:
345
- # return out[0]
346
- # return out
347
-
348
-
349
-
350
  def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
351
  a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
352
  ):
@@ -366,27 +341,60 @@ def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
366
  )
367
  return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
368
 
369
- # 5. Translate
370
- def transl(text, til, change_letters = True):
371
- str = ''
372
- if til == "Къарачай-Малкъар":
373
- if change_letters == True:
374
- str = translatePy(toModel(text), src_lang = 'krc_Cyrl', tgt_lang='rus_Cyrl')
375
- else:
376
- str = translatePy(text, src_lang = 'krc_Cyrl', tgt_lang='rus_Cyrl')
377
- elif til == "Русский":
378
- if change_letters == True:
379
- str = translatePy(text, src_lang = 'rus_Cyrl', tgt_lang='krc_Cyrl')
380
- str = fromModel(str)
381
- else:
382
- str = translatePy(text, src_lang = 'rus_Cyrl', tgt_lang='krc_Cyrl')
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  return str
385
 
386
- demo = gr.Interface(
387
- fn=transl,
388
- inputs=[gr.Textbox(lines=1, placeholder="Your sentence here...", label = "input"), gr.Radio(
389
- ["Къарачай-Малкъар", "Русский"], label="Language", value = "Русский"), gr.Checkbox(label="Change letter", info="It's for inner using", value = True)],
390
- outputs="text"
391
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  demo.launch()
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
4
+ import pandas as pd
 
5
  MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V1'
6
 
7
+ # LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский", "English", "Türk dili"], "token": ["krc_Cyrl", "rus_Cyrl", "eng_Latn", "tur_Latn"]})
8
+ LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский"], "token": ["krc_Cyrl", "rus_Cyrl"]})
9
# Dialect labels shown in the UI, paired with the short codes passed to
# fromModel(). Raw strings keep the literal backslash in each label without
# relying on invalid escape sequences such as "\ч" (a SyntaxWarning on
# Python 3.12+); the runtime values are unchanged.
DIALECT = pd.DataFrame({"dialect": [r"дж\ч", r"ж\ч", r"з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
10
+
11
+ SYSTEM_LANG = "rus"
12
+ NAMES = pd.DataFrame({
13
+ "id": ["title", "from", "to", "your_sent", "transl_sent", "dialect", "translate", "annotation"],
14
+ "krc": ["# Къарачай-Малкъар кёчюрюўчю", "тилден", "тилге", "Мында джаз...", "Кёчюрюлгени", "Къарачай-Малкъарны диалекти", "Кёчюр","Къарачай-Малкъар тилде биринчи кёчюрюўчюдю. [Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_bulat1990) къурагъандыла\n\nМодель Орус бла Къарачай-Малкъар тилледе юйрене тургъаны себебли, Къарачай-Малкъар кёчюрюў башха тиллеге да осал болургъа боллукъду."],
15
+ "rus": ["# Карачаево-Балкарский переводчик", "из", "на", "Напишите здесь...", "Переведённый текст", "Карачаево-Балкарский диалект", "Перевести","Первый переводчик на карачаево-балкарский язык. Создан [Богданом Теунаевым](https://t.me/bogdan_tewunalany), [Али Берберовым](https://t.me/ali_bulat1990)\n\nТак как модель обучалась на парах Русский и Карачаево-Балкарский, то Карачаево-Балкарский перевод для остальных языков может быть хуже."],
16
+ "tur": ["# Karaçay-Malkar tercümanı", "dilden", "dile", "Buraya yaz...", "Çevrilmiş metin burada", "Karaçay-Malkar lehçesi", "Tercüme edin", "İlk çevirmen. [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_bulat1990) tarafından oluşturuldu\n\nModel Rusça ve Karaçay-Malkar çiftleri halinde eğitildiğinden, diğer diller için Karaçay-Malkar çevirisi daha kötü olabilir."],
17
+ "eng": ["# Qarachay-Malqar translator", "from", "to", "Write here...", "Translated text is here", "Qarachay-Malqar dialect", "Translate", "The first translator. Created by [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_bulat1990)\n\nSince the model was trained in pairs of Russian and Qarachay-Malqar, the Qarachay-Malqar translation for other languages may be worse."]
18
+ })
19
+
20
+
21
  # 2. Models
 
 
 
 
22
  tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
23
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
24
 
 
207
  str = str.replace("къ|гъ", "х")
208
  return str
209
 
 
210
  def toModel(str):
211
  str = str.replace("дж", "j")
212
  str = str.replace("Дж", "J")
 
320
  str = str.replace("Нг", " N")
321
  str = str.replace("НГ", " N")
322
  return str
 
 
 
323
 
324
  # 4. Translate function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
326
  a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
327
  ):
 
341
  )
342
  return tokenizer.batch_decode(result, skip_special_tokens=True)[0]
343
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
+ # 5. Translate
346
def translateProcess(text, from_, to, dialect):
    """Translate ``text`` between the languages selected in the UI.

    Args:
        text: The sentence(s) to translate.
        from_: Source language as displayed in the UI (a value of
            ``LANGUAGE.language``).
        to: Target language as displayed in the UI (a value of
            ``LANGUAGE.language``).
        dialect: Dialect label as displayed in the UI (a value of
            ``DIALECT.dialect``).

    Returns:
        The translated string. When the target is ``krc_Cyrl`` the model
        output is additionally passed through ``fromModel`` with the
        selected dialect.

    Note:
        A display name that is not present in the lookup table resolves to
        an empty string, which is then handed to the downstream calls as-is.
    """
    # Resolve the UI display names to the codes used by the model/helpers.
    src_token = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
    tgt_token = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
    dialect_code = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())

    # Qarachay-Malqar input is rewritten by toModel() before translation.
    if src_token == 'krc_Cyrl':
        text = toModel(text)

    translated = translatePy(text, src_lang=src_token, tgt_lang=tgt_token)

    # The original compared an undefined name (`to1`) here, which raised
    # NameError on every call; the resolved target token is what must be tested.
    if tgt_token == 'krc_Cyrl':
        translated = fromModel(translated, dialect=dialect_code)

    return translated
360
 
361
+
362
def _ui_text(key):
    """Return the UI string stored under ``key`` in NAMES for SYSTEM_LANG.

    Yields an empty string when ``key`` is not present in ``NAMES.id``.
    """
    return "".join(NAMES[NAMES.id == key][SYSTEM_LANG].to_list())


# Localised labels used by the interface below.
_title = _ui_text("title")
_from = _ui_text("from")
_to = _ui_text("to")
_your_sent = _ui_text("your_sent")
_transl_sent = _ui_text("transl_sent")
_dialect = _ui_text("dialect")
_translate = _ui_text("translate")
_annotation = _ui_text("annotation")
370
+
371
+ with gr.Blocks() as demo:
372
+ gr.Markdown(_title)
373
+ with gr.Row():
374
+ choice_input = gr.Dropdown(
375
+ choices = LANGUAGE.language.to_list(), label=_from, value = "Русский")
376
+
377
+ with gr.Column():
378
+ with gr.Row():
379
+ with gr.Column():
380
+ choice_output = gr.Dropdown(
381
+ choices = LANGUAGE.language.to_list(), label=_to, value = "Къарачай-Малкъар тил")
382
+ with gr.Column():
383
+ dialect = gr.Dropdown(
384
+ choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
385
+
386
+ with gr.Row():
387
+ with gr.Column():
388
+ text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "")
389
+
390
+ with gr.Column():
391
+ text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "")
392
+
393
+ text_button = gr.Button(_translate, variant = 'primary')
394
+
395
+ text_button.click(translateProcess, inputs=[text_input, choice_input, choice_output, dialect], outputs=[text_output]) # text, from, to, dialect
396
+
397
+ gr.Markdown(_annotation)
398
+
399
+ # 6. Launch
400
  demo.launch()