Spaces:
Running
Running
import os | |
import re | |
from datetime import datetime | |
from typing import Dict | |
import gradio | |
import sign_language_translator as slt | |
# Markdown body rendered below the page heading: usage instructions, links to
# the library and model code, and the known limitations of the rule-based model.
# Surrounding whitespace is removed with .strip().
DESCRIPTION = """Enter your text and select languages from the dropdowns, then click Submit to generate a video. [`Library Repository`](https://github.com/sign-language-translator/sign-language-translator)
The text is preprocessed, tokenized and rearranged and then each token is mapped to a prerecorded video which are concatenated and returned. [`Model Code`](https://github.com/sign-language-translator/sign-language-translator/blob/main/sign_language_translator/models/text_to_sign/concatenative_synthesis.py)
> **NOTE**
> - This model only supports a fixed vocabulary. See the [`*-dictionary-mapping.json`](https://github.com/sign-language-translator/sign-language-datasets/tree/main/parallel_texts) files for supported words.
> - This version needs to re-encode the generated video so that will take some extra time after translation.
> - Since this is a rule-based model, you will have to add **context** to ambiguous words (e.g. glass(material) vs glass(container)).
> - Some signs correspond to words very specific in a particular language so their mapping in other languages will not make sense (e.g. in pakistan-sign-language, signs were recorded in reference to common Urdu words, hence English words "for" & "to" etc do not map well to their original Urdu words "کے لئے" and "کو" etc).
""".strip()

# Used both as the browser page title and as the top-level heading of the app.
TITLE = "Concatenative Synthesis: Rule Based Text to Sign Language Translator"
# Script injected into the page <head>. It defines `updateTextareaDir(language)`,
# which the language dropdown calls to (1) switch the source textbox between
# right-to-left and left-to-right and (2) install an on-screen keyboard layout
# that turns Latin key presses into the selected language's characters.
CUSTOM_JS = """<script>
// Languages whose text is written right-to-left.
const rtlLanguages = ["urdu", "arabic"];

// Keyboard layouts: physical (Latin) key -> character to insert, per language.
// Languages without an entry here are typed unmodified.
const keyMap = {
    "urdu": {
        "1": "۱",
        "2": "۲",
        "3": "۳",
        "4": "۴",
        "5": "۵",
        "6": "۶",
        "7": "۷",
        "8": "۸",
        "9": "۹",
        "0": "۰",
        "q": "ق",
        "w": "و",
        "e": "ع",
        "r": "ر",
        "t": "ت",
        "y": "ے",
        "u": "ء",
        "i": "ی",
        "o": "ہ",
        "p": "پ",
        "a": "ا",
        "s": "س",
        "d": "د",
        "f": "ف",
        "g": "گ",
        "h": "ح",
        "j": "ج",
        "k": "ک",
        "l": "ل",
        "z": "ز",
        "x": "ش",
        "c": "چ",
        "v": "ط",
        "b": "ب",
        "n": "ن",
        "m": "م",
        "R": "ڑ",
        "T": "ٹ",
        "Y": "َ",
        "U": "ئ",
        "I": "ِ",
        "P": "ُ",
        "A": "آ",
        "S": "ص",
        "D": "ڈ",
        "F": "أ",
        "G": "غ",
        "H": "ھ",
        "J": "ض",
        "K": "خ",
        "Z": "ذ",
        "X": "ژ",
        "C": "ث",
        "V": "ظ",
        "N": "ں",
        ",": "،",
        ".": "۔",
        "?": "؟",
        ";": "؛",
    },
    "hindi": {
        "1": "१",
        "2": "२",
        "3": "३",
        "4": "४",
        "5": "५",
        "6": "६",
        "7": "७",
        "8": "८",
        "9": "९",
        "0": "०",
        "=": "ृ",
        "!": "ऍ",
        "@": "ॅ",
        "#": "्र",
        "$": "र्",
        "%": "ज्ञ",
        "^": "त्र",
        "&": "क्ष",
        "*": "श्र",
        "_": "ः",
        "+": "ऋ",
        "q": "ौ",
        "w": "ै",
        "e": "ा",
        "r": "ी",
        "t": "ू",
        "y": "ब",
        "u": "ह",
        "i": "ग",
        "o": "द",
        "p": "ज",
        "[": "ड",
        "]": "़",
        '\\\\': "ॉ",
        "Q": "औ",
        "W": "ऐ",
        "E": "आ",
        "R": "ई",
        "T": "ऊ",
        "Y": "भ",
        "U": "ङ",
        "I": "घ",
        "O": "ध",
        "P": "झ",
        "{": "ढ",
        "}": "ञ",
        "|": "ऑ",
        "a": "ो",
        "s": "े",
        "d": "्",
        "f": "ि",
        "g": "ु",
        "h": "प",
        "j": "र",
        "k": "क",
        "l": "त",
        ";": "च",
        "'": "ट",
        "A": "ओ",
        "S": "ए",
        "D": "अ",
        "F": "इ",
        "G": "उ",
        "H": "फ",
        "J": "ऱ",
        "K": "ख",
        "L": "थ",
        ":": "छ",
        '"': "ठ",
        "z": "ॆ",
        "x": "ं",
        "c": "म",
        "v": "न",
        "b": "व",
        "n": "ल",
        "m": "स",
        ".": "।",
        "/": "य",
        "Z": "ऎ",
        "X": "ँ",
        "C": "ण",
        "V": "ऩ",
        "B": "ऴ",
        "N": "ळ",
        "M": "श",
        "<": "ष",
        ">": "य़",
        // "?":"य़",
    }
};

function updateTextareaDir(language) {
    const container = document.getElementById("source-textbox");
    const sourceTextarea = container ? container.querySelector("textarea") : null;
    if (!sourceTextarea) {
        // The textbox is not in the DOM (yet); nothing to configure.
        return;
    }

    sourceTextarea.setAttribute("dir", rtlLanguages.includes(language) ? "rtl" : "ltr");

    // Remove the handler installed for the previously selected language.
    // It is stored on the element itself so it can be found again here.
    if (sourceTextarea.keypressHandler) {
        sourceTextarea.removeEventListener("keypress", sourceTextarea.keypressHandler);
        sourceTextarea.keypressHandler = null;
    }

    // Languages without a layout (e.g. english) are typed as-is.
    const languageKeyMap = keyMap[language];
    if (!languageKeyMap) {
        return;
    }

    function keypressHandler(event) {
        // Leave keyboard shortcuts (copy, paste, select-all, ...) alone.
        if (event.ctrlKey || event.metaKey || event.altKey) {
            return;
        }
        const key = event.key;
        if (!Object.prototype.hasOwnProperty.call(languageKeyMap, key)) {
            return;
        }
        event.preventDefault();

        // Replace the current selection (or insert at the caret) with the mapped character(s).
        const mappedValue = languageKeyMap[key];
        const start = sourceTextarea.selectionStart;
        const end = sourceTextarea.selectionEnd;
        const currentValue = sourceTextarea.value;
        sourceTextarea.value = currentValue.slice(0, start) + mappedValue + currentValue.slice(end);
        sourceTextarea.selectionStart = sourceTextarea.selectionEnd = start + mappedValue.length;

        // Assigning .value does not fire an input event on its own; dispatch one
        // so the framework's bound textbox value stays in sync with what is shown.
        sourceTextarea.dispatchEvent(new Event("input", { bubbles: true }));
    }

    sourceTextarea.addEventListener("keypress", keypressHandler);
    // Save the handler function to the textarea element for future removal
    sourceTextarea.keypressHandler = keypressHandler;
}
</script>
"""
# todo: add dropdown keyboard custom component with key mapping
# todo: output full height

# Extra stylesheet passed to the Blocks app:
#   .reverse-row          - lays out a row's children in reverse order
#   #auto-complete-button - gives the auto-complete button the primary hover border colour
CUSTOM_CSS = """
.reverse-row {
flex-direction: row-reverse;
}
#auto-complete-button {
border-color: var(--button-primary-border-color-hover);
}
"""
# Request logging: with an HF_TOKEN in the environment, rows go to the shared
# Hugging Face dataset; without one, Gradio's CSVLogger is used instead.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    request_logger = gradio.HuggingFaceDatasetSaver(
        HF_TOKEN,
        "sltAI/crowdsourced-text-to-sign-language-rule-based-translation-corpus",
    )
else:
    request_logger = gradio.CSVLogger()

# One shared translator instance; its languages/format are re-set on every request.
translation_model = slt.models.ConcatenativeSynthesis("ur", "pk-sl", "video")

# Cache of beam-sampling wrappers, filled lazily and keyed by model code.
language_models: Dict[str, slt.models.BeamSampling] = {}

# Display name <-> short language code, in both directions.
full_to_short = dict(english="en", urdu="ur", hindi="hi")
short_to_full = {code: name for name, code in full_to_short.items()}
def auto_complete_text(model_code: str, text: str):
    """Continue *text* using the language model identified by *model_code*.

    The model is loaded and wrapped in a ``BeamSampling`` sampler on first use
    and cached in ``language_models`` for later calls.

    Args:
        model_code: Code of the language model to load via ``slt.get_model``.
        text: Text typed so far. ``None`` or an empty string lets the sampler
            start from scratch.

    Returns:
        The completed text with any leading start-of-sequence and trailing
        end-of-sequence token removed. Empty string if nothing was generated.
    """
    if model_code not in language_models:
        lm = slt.get_model(model_code)
        language_models[model_code] = slt.models.BeamSampling(
            lm,  # type: ignore
            start_of_sequence_token=getattr(lm, "start_of_sequence_token", "<"),  # type: ignore
            end_of_sequence_token=getattr(lm, "end_of_sequence_token", ">"),  # type: ignore
        )

    # todo: better tokenize/detokenize
    # Splitting on word boundaries keeps the separators as tokens, so joining
    # the tokens back together reproduces the text exactly.
    # `text or ""` tolerates a cleared textbox delivering None.
    tokens = [piece for piece in re.split(r"\b", text or "") if piece]

    sampler = language_models[model_code]
    sampler.max_length = len(tokens) + 10  # allow up to 10 tokens beyond the prompt
    completion, _ = sampler.complete(tokens or None)

    # Guard each index: the completion may be empty, or become empty once the
    # start token has been stripped.
    if completion and completion[0] == sampler.start_of_sequence_token:  # type: ignore
        completion = completion[1:]  # type: ignore
    if completion and completion[-1] == sampler.end_of_sequence_token:  # type: ignore
        completion = completion[:-1]  # type: ignore

    return "".join(completion)
def text_to_video(
    text: str,
    text_language: str,
    sign_language: str,
    sign_format: str = "video",
    output_path: str = "output.mp4",
    codec="h264",  # ToDo: install h264 codec for opencv
):
    """Translate *text* to sign language and write the result to *output_path*.

    The shared ``translation_model`` is reconfigured in place for this request.
    A video result is saved with *codec*; a landmarks result is saved as an
    animation after the hand landmarks have been re-anchored. Existing files
    at *output_path* are overwritten. Nothing is returned.
    """
    translation_model.text_language = text_language
    translation_model.sign_language = sign_language
    translation_model.sign_format = sign_format
    if sign_format == "landmarks":
        translation_model.sign_embedding_model = "mediapipe-world"

    sign = translation_model.translate(text)

    if not isinstance(sign, slt.Landmarks):
        sign.save(output_path, overwrite=True, codec=codec)
        # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")
        return

    # Alternative look (disabled): large hands on sides
    #   sign.data[:, 33:] *= 2
    #   sign.data[:, 33:54, 0] += 0.25
    #   sign.data[:, 54:, 0] -= 0.25

    # Hands moved to pose wrists: shift each hand block so that its first
    # landmark coincides with the matching wrist landmark.
    # NOTE(review): indices presumably follow a 33-pose + 21 + 21 hand landmark
    # layout with pose wrists at 15 and 16 - confirm against the embedding model.
    sign.data[:, 33:54, :3] += sign.data[:, 15:16, :3] - sign.data[:, 33:34, :3]
    sign.data[:, 54:, :3] += sign.data[:, 16:17, :3] - sign.data[:, 54:55, :3]
    sign.save_animation(output_path, overwrite=True)
def translate(text: str, text_lang: str, sign_lang: str, sign_format: str):
    """Handle a translation request from the UI and log it.

    Args:
        text: Sentence to translate.
        text_lang: Text language, as a display name ("urdu") or short code ("ur").
        sign_lang: Target sign language code.
        sign_format: Output format passed through to ``text_to_video``.

    Returns:
        Path of the generated file ("output.mp4").

    Raises:
        gradio.Error: If translation fails; the original exception is chained
            as the cause and its message is recorded in the request log.
    """
    text_lang = full_to_short.get(text_lang, text_lang)

    # Log row: text as submitted, language code, sign language, error (if any), timestamp.
    log = [
        text,
        text_lang,
        sign_lang,
        None,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
    ]
    path = "output.mp4"

    try:
        if text_lang == "en":
            # Lower-case only the first character of English input.
            text = text[:1].lower() + text[1:]

        text_to_video(
            text,
            text_lang,
            sign_lang,
            sign_format=sign_format,
            output_path=path,
            codec="mp4v",
        )
    except Exception as exc:
        log[3] = str(exc)
        request_logger.flag(log)
        raise gradio.Error(f"Error during translation: {exc}") from exc

    # Logged outside the try block so that a logger failure is neither logged a
    # second time nor reported to the user as a translation error.
    request_logger.flag(log)
    return path
# UI definition. Layout: heading and description on top, then one row holding
# an output column and an input column (declared outputs-first; the
# "reverse-row" CSS class reverses their visual order), followed by examples.
# NOTE(review): the nesting below was reconstructed from the inline comments -
# confirm the placement of Examples / logger setup against the deployed layout.
with gradio.Blocks(title=TITLE, head=CUSTOM_JS, css=CUSTOM_CSS) as gradio_app:
    gradio.Markdown(f"# {TITLE}")
    gradio.Markdown(DESCRIPTION)
    with gradio.Row(elem_classes=["reverse-row"]):  # Inputs and Outputs
        with gradio.Column():  # Outputs
            gradio.Markdown("## Output Sign Language")
            output_video = gradio.Video(
                format="mp4",
                label="Synthesized Sign Language Video",
                autoplay=True,
                show_download_button=True,
                include_audio=False,
            )

        with gradio.Column():  # Inputs
            gradio.Markdown("## Select Languages")
            with gradio.Row():
                # Choices show full language names where one is known,
                # otherwise the raw language code.
                text_lang_dropdown = gradio.Dropdown(
                    choices=[
                        short_to_full.get(code.value, code.value)
                        for code in slt.TextLanguageCodes
                    ],
                    value=short_to_full.get(
                        slt.TextLanguageCodes.URDU.value,
                        slt.TextLanguageCodes.URDU.value,
                    ),
                    label="Text Language",
                    elem_id="text-lang-dropdown",
                )
                # Browser-only handler (no Python function): updates the
                # textbox direction and key mapping for the chosen language.
                text_lang_dropdown.change(
                    None, inputs=text_lang_dropdown, js="updateTextareaDir"
                )
                sign_lang_dropdown = gradio.Dropdown(
                    choices=[code.value for code in slt.SignLanguageCodes],
                    value=slt.SignLanguageCodes.PAKISTAN_SIGN_LANGUAGE.value,
                    label="Sign Language",
                )
                output_format_dropdown = gradio.Dropdown(
                    choices=[
                        slt.SignFormatCodes.VIDEO.value,
                        slt.SignFormatCodes.LANDMARKS.value,
                    ],
                    value=slt.SignFormatCodes.VIDEO.value,
                    label="Output Format",
                )
                # todo: sign format: video/landmarks (tabs?)

            gradio.Markdown("## Input Text")
            with gradio.Row():  # Source TextArea
                # elem_id must stay "source-textbox": the script in CUSTOM_JS
                # looks the textarea up by this id.
                source_textbox = gradio.Textbox(
                    lines=4,
                    placeholder="Enter Text Here...",
                    label="Spoken Language Sentence",
                    show_copy_button=True,
                    elem_id="source-textbox",
                )
            with gradio.Row():  # clear/auto-complete/Language Model
                language_model_dropdown = gradio.Dropdown(
                    choices=[
                        slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                        slt.ModelCodes.TRANSFORMER_LM_UR_SUPPORTED.value,
                    ],
                    value=slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                    label="Select language model to Generate sample text",
                )
                auto_complete_button = gradio.Button(
                    "Auto-Complete", elem_id="auto-complete-button"
                )
                # The completion is written back into the same textbox it was read from.
                auto_complete_button.click(
                    auto_complete_text,
                    inputs=[language_model_dropdown, source_textbox],
                    outputs=[source_textbox],
                    api_name=False,
                )
                clear_button = gradio.ClearButton(source_textbox, api_name=False)

            with gradio.Row():  # Translate Button
                translate_button = gradio.Button("Translate", variant="primary")
                # The only event given an explicit API name ("translate").
                translate_button.click(
                    translate,
                    inputs=[
                        source_textbox,
                        text_lang_dropdown,
                        sign_lang_dropdown,
                        output_format_dropdown,
                    ],
                    outputs=[output_video],
                    api_name="translate",
                )

    # Sample inputs for each supported text language and output format.
    # Each row is: text, text language, sign language, output format.
    gradio.Examples(
        [
            ["We are here to use this.", "english", "pakistan-sign-language", "video"],
            ["i(me) admire art.", "english", "pakistan-sign-language", "landmarks"],
            ["یہ بہت اچھا ہے۔", "urdu", "pakistan-sign-language", "video"],
            ["وہ کام آسان تھا۔", "urdu", "pakistan-sign-language", "landmarks"],
            ["कैसे हैं आप?", "hindi", "pakistan-sign-language", "video"],
            ["पाँच घंटे।", "hindi", "pakistan-sign-language", "landmarks"],
        ],
        inputs=[
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            output_format_dropdown,
        ],
        outputs=output_video,
    )
    # Five logged fields, in the same order as the row assembled in translate():
    # text, text language, sign language, exception, timestamp.
    request_logger.setup(
        [
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            gradio.Markdown(label="Exception"),
            gradio.Markdown(label="Timestamp"),
        ],
        "flagged",
    )
    # Apply the text direction / key mapping for the default language on page load.
    gradio_app.load(None, inputs=[text_lang_dropdown], js="updateTextareaDir")

if __name__ == "__main__":
    gradio_app.launch()