nllb / app.py
davanstrien's picture
davanstrien HF staff
Add import for regular expressions
620b306
raw
history blame
3.32 kB
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from flores import code_mapping
import platform
import re
# On macOS (Darwin) run on CPU with the smaller distilled checkpoint;
# everywhere else use CUDA with the 3.3B checkpoint.
_ON_MACOS = platform.system() == "Darwin"
device = "cpu" if _ON_MACOS else "cuda"
MODEL_NAME = (
    "facebook/nllb-200-distilled-600M" if _ON_MACOS else "facebook/nllb-200-3.3B"
)

# Re-order the mapping by its values so the dropdown choices (the keys)
# are listed in that order.
code_mapping = dict(sorted(code_mapping.items(), key=lambda pair: pair[1]))
flores_codes = list(code_mapping)
def load_model():
    """Load the ``MODEL_NAME`` checkpoint and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has already been moved
        to ``device``.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    seq2seq_model = seq2seq_model.to(device)
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return seq2seq_model, seq2seq_tokenizer


# Loaded once at import time and shared by every translation request.
model, tokenizer = load_model()
@spaces.GPU
def _translate(text: str, src_lang: str, tgt_lang: str):
    """Translate a single piece of text with the module-level model.

    Args:
        text: The text to translate.
        src_lang: Key of ``code_mapping`` naming the source language.
        tgt_lang: Key of ``code_mapping`` naming the target language.

    Returns:
        The ``"translation_text"`` field of the first pipeline result.

    Raises:
        KeyError: If ``src_lang`` or ``tgt_lang`` is not in ``code_mapping``.
    """
    # The values stored in code_mapping are what the pipeline receives as
    # its src_lang / tgt_lang arguments.
    nllb_pipeline = pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=code_mapping[src_lang],
        tgt_lang=code_mapping[tgt_lang],
        device=device,
    )
    first_result = nllb_pipeline(text, max_length=400)[0]
    return first_result["translation_text"]
# Whitespace that directly follows a full stop, exclamation mark or
# question mark is treated as a sentence boundary.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Paragraphs longer than this many characters are translated one sentence
# at a time instead of in a single call.
_MAX_CHUNK_CHARS = 500


def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` paragraph by paragraph.

    The input is split on newlines. Each non-blank paragraph is translated
    with ``_translate``; paragraphs longer than ``_MAX_CHUNK_CHARS``
    characters are first split into sentences, which are translated
    individually and re-joined with single spaces. Translated paragraphs
    are separated by a blank line in the result.

    Blank lines and empty input are skipped rather than sent to the model,
    so an empty or whitespace-only ``text`` returns ``""`` without running
    any translation.

    Args:
        text: The text to translate; may contain several paragraphs.
        src_lang: Source language, forwarded unchanged to ``_translate``.
        tgt_lang: Target language, forwarded unchanged to ``_translate``.

    Returns:
        The translated text with surrounding whitespace stripped.
    """
    if not text:
        return ""

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        # Blank lines (e.g. the gap between two paragraphs) carry nothing
        # to translate; skipping them avoids a pointless model call.
        if not paragraph.strip():
            continue

        if len(paragraph) > _MAX_CHUNK_CHARS:
            sentences = _SENTENCE_BOUNDARY.split(paragraph)
            translated_sentences = [
                _translate(sentence, src_lang, tgt_lang)
                for sentence in sentences
                if sentence.strip()
            ]
            translated_paragraphs.append(" ".join(translated_sentences))
        else:
            translated_paragraphs.append(_translate(paragraph, src_lang, tgt_lang))

    return "\n\n".join(translated_paragraphs).strip()
# Markdown shown under the page title.
description = """
No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages.
This demo application allows you to use the NLLB model to translate text between a source and target language.
## Notes
- Whilst the model supports 200 languages, the quality of translations may vary between languages.
- "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
- The demo is not intended to be used for very long texts.
"""
# Markdown shown under the "Instructions" heading.
instructions = """
1. Select the source and target language from the dropdown menus.
2. Enter the text you would like to translate.
3. Click the 'Translate text' button.
"""
# Build the page: title, description and instructions, followed by one row
# each for the language dropdowns, the input box, the button and the output.
with gr.Blocks() as demo:
    gr.Markdown("# No Language Left Behind (NLLB) Translation Demo")
    gr.Markdown(description)
    gr.Markdown("## Instructions")
    gr.Markdown(instructions)
    with gr.Row():
        # Both dropdowns offer the keys of code_mapping as choices.
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=flores_codes)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    # Clicking the button calls translate(input_text, src_lang, target_lang)
    # and writes the returned string into the output box.
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )
# Start the app when this file is run.
demo.launch()