File size: 3,512 Bytes
ce1c4dd
 
 
 
6318241
1cf16a5
ce1c4dd
 
4307124
 
 
 
 
725ca4d
 
 
 
 
ddbcd26
ce1c4dd
 
d1882a9
 
 
8210be8
ce1c4dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8210be8
 
ce1c4dd
 
 
 
 
 
 
0101c12
2e7cfe8
 
0101c12
6d5dae6
340a52c
6d5dae6
340a52c
2923aa8
340a52c
6d5dae6
340a52c
ddbcd26
 
 
 
 
 
 
340a52c
 
d1882a9
340a52c
6d5dae6
2e7cfe8
0101c12
 
 
 
d1882a9
0101c12
d1882a9
2923aa8
0101c12
 
 
 
d1882a9
0101c12
 
 
 
d1882a9
 
76c5f4f
d1882a9
0101c12
 
 
ce1c4dd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Display name shown in the UI -> language code handed to the tokenizer.
# Insertion order is the order in which the dropdowns list the languages.
LANG_CODES = {
    "English": "en",
    "Romanian": "ro",
    "Spanish": "es",
    "Italian": "it",
    "German": "de",
    "Portuguese": "pt",
    "French": "fr",
    "Dutch": "nl",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
}

# Single checkpoint name so the model and its tokenizer cannot drift apart.
MODEL_NAME = "facebook/m2m100_418M"

# Loaded once at import time; the model is moved to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def translate(text, src_lang, tgt_lang, candidates: int):
    """
    Translate the text from source lang to target lang

    Args:
        text: The text to translate.
        src_lang: Display name of the source language (a key of LANG_CODES).
        tgt_lang: Display name of the target language (a key of LANG_CODES).
        candidates: Number of alternative translations to return; the same
            value is used as the beam width.

    Returns:
        The candidate translations joined with newlines, or an empty string
        when ``text`` is empty or only whitespace.

    Raises:
        gr.Error: If ``src_lang`` or ``tgt_lang`` is not a key of LANG_CODES.
    """
    # Nothing to translate: skip the model instead of decoding invented text.
    if text is None or not text.strip():
        return ""

    src = LANG_CODES.get(src_lang)
    tgt = LANG_CODES.get(tgt_lang)
    if src is None or tgt is None:
        unknown = src_lang if src is None else tgt_lang
        # Fail with a readable message rather than a lookup error on None
        # further down.
        raise gr.Error(f"Unsupported language: {unknown!r}")

    # The count may arrive as a float (e.g. from a UI slider -- TODO confirm);
    # beam search needs a whole number of at least one beam.
    num_candidates = max(1, int(candidates))

    tokenizer.src_lang = src
    tokenizer.tgt_lang = tgt

    ins = tokenizer(text, return_tensors='pt').to(device)

    # Scores and hidden states are not requested: only `sequences` is read
    # below, so collecting them would cost memory for no result.
    gen_args = {
        'return_dict_in_generate': True,
        'length_penalty': 0.0,  # don't encourage longer or shorter output
        'num_return_sequences': num_candidates,
        # Beam width equals the number of returned sequences, so every
        # finished beam is reported.
        'num_beams': num_candidates,
        # Force the first generated token to be the target-language token.
        'forced_bos_token_id': tokenizer.lang_code_to_id[tgt],
    }

    outs = model.generate(**ins, **gen_args)
    output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)

    return '\n'.join(output)

# Build the page: the description and the translation controls share one row.
with gr.Blocks() as app:
    # Description rendered next to the controls.
    description = """
    # Translate any text to ANY language!
    
    ### Bună! 💬

    This is an English to Any Language / Any Language to English neural machine translation app.

    Input your text to translate, a source language and target language, and desired number of return sequences!

    Return sequences is formally known as alternative translations.
    If the main translation is not good for what tone you expect, you can increase return sequences and retranslate.
    It will show a list of alternative translations, alongside the main translation.

    Right now, this only supports 12 languages.

    I will add more later! So stay tuned!

    ### Model and Data
    This app uses Facebook/Meta AI's M2M100 418M param model for translation.
    
    ### This app is a machine and not all translations will be perfect.
    """

    with gr.Row():
        gr.Markdown(description)
        with gr.Column():
            # Input widgets, created in the order they appear on the page.
            text_box = gr.Textbox(
                label="Input Text",
                value="Hello, world! Have a nice day!",
            )
            source_dropdown = gr.Dropdown(
                label="Source Language",
                value="English",
                choices=list(LANG_CODES),
            )
            target_dropdown = gr.Dropdown(
                label="Target Language",
                value="Romanian",
                choices=list(LANG_CODES),
            )
            candidates_slider = gr.Slider(
                label="Number of return sequences",
                value=3,
                minimum=1,
                maximum=256,
                step=1,
            )

            # Same order as translate()'s positional parameters.
            translate_inputs = [
                text_box,
                source_dropdown,
                target_dropdown,
                candidates_slider,
            ]
            result_box = gr.Textbox()

            run_button = gr.Button("Translate!")
            run_button.click(
                fn=translate,
                inputs=translate_inputs,
                outputs=result_box,
            )

            # Example rows that fill the four inputs above.
            example_rows = [
                ["Hello! How are you?", "English", "Romanian", 3],
                ["Mă numesc Popa Mihai și am 13 ani.", "Romanian", "English",  3],
                ["Tu vreau cafea.", "Romanian", "Romanian", 3],
                ["Do you needs coffee?", "English", "English", 3],
            ]
            gr.Examples(examples=example_rows, inputs=translate_inputs)

app.launch()