mdsr's picture
lowercase initial of a sentence (temp fix)
80e12ba
import os
import re
from datetime import datetime
from typing import Dict
import gradio
import sign_language_translator as slt
# Markdown body rendered below the page heading (passed to gradio.Markdown in
# the Blocks layout). Surrounding whitespace is removed with .strip().
DESCRIPTION = """Enter your text and select languages from the dropdowns, then click Submit to generate a video. [`Library Repository`](https://github.com/sign-language-translator/sign-language-translator)
The text is preprocessed, tokenized and rearranged and then each token is mapped to a prerecorded video which are concatenated and returned. [`Model Code`](https://github.com/sign-language-translator/sign-language-translator/blob/main/sign_language_translator/models/text_to_sign/concatenative_synthesis.py)
> **NOTE**
> - This model only supports a fixed vocabulary. See the [`*-dictionary-mapping.json`](https://github.com/sign-language-translator/sign-language-datasets/tree/main/parallel_texts) files for supported words.
> - This version needs to re-encode the generated video so that will take some extra time after translation.
> - Since this is a rule-based model, you will have to add **context** to ambiguous words (e.g. glass(material) vs glass(container)).
> - Some signs correspond to words very specific in a particular language so their mapping in other languages will not make sense (e.g. in pakistan-sign-language, signs were recorded in reference to common Urdu words, hence English words "for" & "to" etc do not map well to their original Urdu words "کے لئے" and "کو" etc).
""".strip()
# Used both as the Blocks `title=` argument and as the top-level "# ..." heading.
TITLE = "Concatenative Synthesis: Rule Based Text to Sign Language Translator"
# Script injected through the Blocks `head=` argument. It defines
# `updateTextareaDir(language)`, which the layout calls (via `js=`) on page load
# and whenever the text-language dropdown changes. The function:
#   * sets the source textarea's `dir` attribute to "rtl" for the languages in
#     `rtlLanguages`, otherwise "ltr";
#   * installs a keypress handler that replaces typed keys with the characters
#     listed for that language in `keyMap`.
# Languages that have no entry in `keyMap` (e.g. "english") are typed natively:
# the handler returns early instead of dereferencing a missing layout.
# After a remapped character is inserted the handler dispatches an "input"
# event, because assigning `.value` from script does not emit one; listeners
# bound to the textarea (presumably Gradio's value binding) rely on it.
# The doubled backslashes in the '\\\\' key are Python escapes: the browser
# receives the JS literal '\\', i.e. the backslash key.
# NOTE(review): the Urdu "6" entry appears to use U+0666 while the other Urdu
# digits use the U+06F0-U+06F9 block -- confirm this is intentional.
CUSTOM_JS = """<script>
const rtlLanguages = ["urdu", "arabic"];
const keyMap = {
"urdu": {
"1": "۱",
"2": "۲",
"3": "۳",
"4": "۴",
"5": "۵",
"6": "٦",
"7": "۷",
"8": "۸",
"9": "۹",
"0": "۰",
"q": "ق",
"w": "و",
"e": "ع",
"r": "ر",
"t": "ت",
"y": "ے",
"u": "ء",
"i": "ی",
"o": "ہ",
"p": "پ",
"a": "ا",
"s": "س",
"d": "د",
"f": "ف",
"g": "گ",
"h": "ح",
"j": "ج",
"k": "ک",
"l": "ل",
"z": "ز",
"x": "ش",
"c": "چ",
"v": "ط",
"b": "ب",
"n": "ن",
"m": "م",
"R": "ڑ",
"T": "ٹ",
"Y": "َ",
"U": "ئ",
"I": "ِ",
"P": "ُ",
"A": "آ",
"S": "ص",
"D": "ڈ",
"F": "أ",
"G": "غ",
"H": "ھ",
"J": "ض",
"K": "خ",
"Z": "ذ",
"X": "ژ",
"C": "ث",
"V": "ظ",
"N": "ں",
",": "،",
".": "۔",
"?": "؟",
";": "؛",
},
"hindi": {
"1": "१",
"2": "२",
"3": "३",
"4": "४",
"5": "५",
"6": "६",
"7": "७",
"8": "८",
"9": "९",
"0": "०",
"=": "ृ",
"!": "ऍ",
"@": "ॅ",
"#": "्र",
"$": "र्",
"%": "ज्ञ",
"^": "त्र",
"&": "क्ष",
"*": "श्र",
"_": "ः",
"+": "ऋ",
"q": "ौ",
"w": "ै",
"e": "ा",
"r": "ी",
"t": "ू",
"y": "ब",
"u": "ह",
"i": "ग",
"o": "द",
"p": "ज",
"[": "ड",
"]": "़",
'\\\\': "ॉ",
"Q": "औ",
"W": "ऐ",
"E": "आ",
"R": "ई",
"T": "ऊ",
"Y": "भ",
"U": "ङ",
"I": "घ",
"O": "ध",
"P": "झ",
"{": "ढ",
"}": "ञ",
"|": "ऑ",
"a": "ो",
"s": "े",
"d": "्",
"f": "ि",
"g": "ु",
"h": "प",
"j": "र",
"k": "क",
"l": "त",
";": "च",
"'": "ट",
"A": "ओ",
"S": "ए",
"D": "अ",
"F": "इ",
"G": "उ",
"H": "फ",
"J": "ऱ",
"K": "ख",
"L": "थ",
":": "छ",
'"': "ठ",
"z": "ॆ",
"x": "ं",
"c": "म",
"v": "न",
"b": "व",
"n": "ल",
"m": "स",
".": "।",
"/": "य",
"Z": "ऎ",
"X": "ँ",
"C": "ण",
"V": "ऩ",
"B": "ऴ",
"N": "ळ",
"M": "श",
"<": "ष",
">": "य़",
// "?":"य़",
}
};
function updateTextareaDir(language) {
  const sourceTextarea = document.getElementById("source-textbox").querySelector("textarea");
  if (rtlLanguages.includes(language)) {
    sourceTextarea.setAttribute("dir", "rtl");
  } else {
    sourceTextarea.setAttribute("dir", "ltr");
  }
  // Undefined for languages without a layout (e.g. "english"): keys are then typed natively.
  const layout = keyMap[language];
  function keypressHandler(event) {
    const key = event.key;
    if (!layout || !Object.prototype.hasOwnProperty.call(layout, key)) {
      return;
    }
    event.preventDefault();
    const mappedValue = layout[key];
    const start = sourceTextarea.selectionStart;
    const end = sourceTextarea.selectionEnd;
    sourceTextarea.value = sourceTextarea.value.slice(0, start) + mappedValue + sourceTextarea.value.slice(end);
    sourceTextarea.selectionStart = sourceTextarea.selectionEnd = start + mappedValue.length;
    // Setting .value from script fires no "input" event; emit one so bound listeners see the new text.
    sourceTextarea.dispatchEvent(new Event("input", { bubbles: true }));
  }
  sourceTextarea.removeEventListener("keypress", sourceTextarea.keypressHandler);
  sourceTextarea.addEventListener("keypress", keypressHandler);
  // Save the handler function to the textarea element for future removal
  sourceTextarea.keypressHandler = keypressHandler;
}
</script>
"""
# todo: add dropdown keyboard custom component with key mapping
# todo: output full height
# Extra stylesheet passed to the Blocks `css=` argument.
#   .reverse-row          -- applied via elem_classes to the outer Row of the
#                            layout; lays its children out in reverse order.
#   #auto-complete-button -- matches the elem_id of the "Auto-Complete" button.
CUSTOM_CSS = """
.reverse-row {
flex-direction: row-reverse;
}
#auto-complete-button {
border-color: var(--button-primary-border-color-hover);
}
"""
# Destination for per-request log rows: a Hugging Face dataset when HF_TOKEN
# is set in the environment, otherwise Gradio's CSV logger.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    request_logger = gradio.HuggingFaceDatasetSaver(
        HF_TOKEN,
        "sltAI/crowdsourced-text-to-sign-language-rule-based-translation-corpus",
    )
else:
    request_logger = gradio.CSVLogger()

# Single translator instance shared by all requests; text_to_video() overwrites
# its language/format attributes before every translation.
translation_model = slt.models.ConcatenativeSynthesis("ur", "pk-sl", "video")

# Beam-sampling wrappers keyed by model code, created on first use by
# auto_complete_text().
language_models: Dict[str, slt.models.BeamSampling] = {}
# Language names shown in the text-language dropdown -> short language codes.
full_to_short = dict(english="en", urdu="ur", hindi="hi")
# Inverse lookup: short language code -> dropdown display name.
short_to_full = {code: name for name, code in full_to_short.items()}
def auto_complete_text(model_code: str, text: str):
    """Extend ``text`` with a continuation sampled from a language model.

    The beam-sampling wrapper for ``model_code`` is built on first use and
    cached in the module-level ``language_models`` dict.

    Args:
        model_code: Code passed to ``slt.get_model`` to load the language model.
        text: Text typed so far. ``None`` or an empty string starts a
            completion with no prompt tokens.

    Returns:
        The completed text, with a leading start-of-sequence token and a
        trailing end-of-sequence token removed when present.
    """
    if model_code not in language_models:
        base_model = slt.get_model(model_code)
        language_models[model_code] = slt.models.BeamSampling(
            base_model,  # type: ignore
            start_of_sequence_token=getattr(base_model, "start_of_sequence_token", "<"),  # type: ignore
            end_of_sequence_token=getattr(base_model, "end_of_sequence_token", ">"),  # type: ignore
        )

    # todo: better tokenize/detokenize
    # `text or ""`: the textbox value may arrive as None (e.g. after a clear),
    # which re.split() would reject with a TypeError.
    tokens = [w for w in re.split(r"\b", text or "") if w]

    lm = language_models[model_code]
    # Allow the sampler to add up to 10 tokens beyond the prompt.
    lm.max_length = len(tokens) + 10
    completion, _ = lm.complete(tokens or None)

    # Each check is guarded: an empty completion, or one that held only the
    # start marker, would otherwise raise IndexError on [0] / [-1].
    if completion and completion[0] == lm.start_of_sequence_token:  # type: ignore
        completion = completion[1:]  # type: ignore
    if completion and completion[-1] == lm.end_of_sequence_token:  # type: ignore
        completion = completion[:-1]  # type: ignore

    return "".join(completion)
def text_to_video(
    text: str,
    text_language: str,
    sign_language: str,
    sign_format: str = "video",
    output_path: str = "output.mp4",
    codec="h264",  # ToDo: install h264 codec for opencv
):
    """Translate ``text`` with the shared model and write the result to disk.

    Args:
        text: Sentence to translate.
        text_language: Language of ``text``, assigned to the shared model.
        sign_language: Target sign language, assigned to the shared model.
        sign_format: Output representation; ``"landmarks"`` additionally
            selects the ``"mediapipe-world"`` embedding model.
        output_path: File the video/animation is written to (overwritten).
        codec: Codec forwarded to ``save`` for non-landmark output.
    """
    # Point the module-level translator at this request's settings.
    translation_model.text_language = text_language
    translation_model.sign_language = sign_language
    translation_model.sign_format = sign_format
    if sign_format == "landmarks":
        translation_model.sign_embedding_model = "mediapipe-world"

    sign = translation_model.translate(text)

    if not isinstance(sign, slt.Landmarks):
        sign.save(output_path, overwrite=True, codec=codec)
        # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")
        return

    # Alternative kept for reference -- large hands on sides:
    # sign.data[:, 33:] *= 2
    # sign.data[:, 33:54, 0] += 0.25
    # sign.data[:, 54:, 0] -= 0.25

    # Hands moved to pose wrists: shift each hand block so that its first
    # landmark coincides with the matching pose landmark.
    # NOTE(review): presumably rows 0-32 are pose landmarks, 33-53 and 54+ the
    # two hands, and rows 15/16 the pose wrists -- confirm against the
    # landmark layout.
    first_hand_shift = -sign.data[:, 33:34, :3] + sign.data[:, 15:16, :3]
    sign.data[:, 33:54, :3] += first_hand_shift
    second_hand_shift = -sign.data[:, 54:55, :3] + sign.data[:, 16:17, :3]
    sign.data[:, 54:, :3] += second_hand_shift
    sign.save_animation(output_path, overwrite=True)
    # ToDo: video.watermark("Sign Language Translator\nAI Generated Video")
def translate(text: str, text_lang: str, sign_lang: str, sign_format: str):
    """Handle a translate request: render the sign video and log the request.

    Args:
        text: Sentence entered by the user.
        text_lang: Text language as a dropdown display name or a short code;
            display names are converted through ``full_to_short``.
        sign_lang: Target sign language code.
        sign_format: Requested output format, forwarded to ``text_to_video``.

    Returns:
        Path of the generated file (``"output.mp4"``).

    Raises:
        gradio.Error: If anything fails inside the ``try`` block. The original
            exception is attached as ``__cause__`` and its message is written
            to the request log.
    """
    text_lang = full_to_short.get(text_lang, text_lang)
    # Row layout: [text, text language, sign language, exception, timestamp].
    # The text is captured here, i.e. before the lowercase tweak below.
    log = [
        text,
        text_lang,
        sign_lang,
        None,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
    ]
    try:
        if text_lang == "en":
            # Temporary fix: lowercase the sentence-initial letter.
            text = text[:1].lower() + text[1:]
        path = "output.mp4"
        text_to_video(
            text,
            text_lang,
            sign_lang,
            sign_format=sign_format,
            output_path=path,
            codec="mp4v",
        )
        request_logger.flag(log)
        return path
    except Exception as exc:
        log[3] = str(exc)
        request_logger.flag(log)
        # Chain explicitly so the underlying traceback is kept as the cause.
        raise gradio.Error(f"Error during translation: {exc}") from exc
# Page layout and event wiring.
# NOTE(review): the nesting below is reconstructed from the `with` statements
# and their inline comments -- confirm it against the intended layout,
# especially the placement of Examples and of the logger setup.
with gradio.Blocks(title=TITLE, head=CUSTOM_JS, css=CUSTOM_CSS) as gradio_app:
    gradio.Markdown(f"# {TITLE}")
    gradio.Markdown(DESCRIPTION)
    # The "reverse-row" class is defined in CUSTOM_CSS (flex-direction: row-reverse).
    with gradio.Row(elem_classes=["reverse-row"]):  # Inputs and Outputs
        with gradio.Column():  # Outputs
            gradio.Markdown("## Output Sign Language")
            output_video = gradio.Video(
                format="mp4",
                label="Synthesized Sign Language Video",
                autoplay=True,
                show_download_button=True,
                include_audio=False,
            )
        with gradio.Column():  # Inputs
            gradio.Markdown("## Select Languages")
            with gradio.Row():
                # Choices are display names where full_to_short knows the
                # language, otherwise the raw language code.
                text_lang_dropdown = gradio.Dropdown(
                    choices=[
                        short_to_full.get(code.value, code.value)
                        for code in slt.TextLanguageCodes
                    ],
                    value=short_to_full.get(
                        slt.TextLanguageCodes.URDU.value,
                        slt.TextLanguageCodes.URDU.value,
                    ),
                    label="Text Language",
                    elem_id="text-lang-dropdown",
                )
                # Client-side only (fn=None): calls the JS function defined in
                # CUSTOM_JS with the newly selected language.
                text_lang_dropdown.change(
                    None, inputs=text_lang_dropdown, js="updateTextareaDir"
                )
                sign_lang_dropdown = gradio.Dropdown(
                    choices=[code.value for code in slt.SignLanguageCodes],
                    value=slt.SignLanguageCodes.PAKISTAN_SIGN_LANGUAGE.value,
                    label="Sign Language",
                )
                output_format_dropdown = gradio.Dropdown(
                    choices=[
                        slt.SignFormatCodes.VIDEO.value,
                        slt.SignFormatCodes.LANDMARKS.value,
                    ],
                    value=slt.SignFormatCodes.VIDEO.value,
                    label="Output Format",
                )
                # todo: sign format: video/landmarks (tabs?)
            gradio.Markdown("## Input Text")
            with gradio.Row():  # Source TextArea
                # elem_id is what updateTextareaDir() in CUSTOM_JS looks up.
                source_textbox = gradio.Textbox(
                    lines=4,
                    placeholder="Enter Text Here...",
                    label="Spoken Language Sentence",
                    show_copy_button=True,
                    elem_id="source-textbox",
                )
            with gradio.Row():  # clear/auto-complete/Language Model
                language_model_dropdown = gradio.Dropdown(
                    choices=[
                        slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                        slt.ModelCodes.TRANSFORMER_LM_UR_SUPPORTED.value,
                    ],
                    value=slt.ModelCodes.MIXER_LM_NGRAM_URDU.value,
                    label="Select language model to Generate sample text",
                )
                # elem_id matches the #auto-complete-button rule in CUSTOM_CSS.
                auto_complete_button = gradio.Button(
                    "Auto-Complete", elem_id="auto-complete-button"
                )
                # The completion is written back into the same textbox.
                auto_complete_button.click(
                    auto_complete_text,
                    inputs=[language_model_dropdown, source_textbox],
                    outputs=[source_textbox],
                    api_name=False,
                )
                clear_button = gradio.ClearButton(source_textbox, api_name=False)
            with gradio.Row():  # Translate Button
                translate_button = gradio.Button("Translate", variant="primary")
                # Input order matches translate(text, text_lang, sign_lang, sign_format).
                translate_button.click(
                    translate,
                    inputs=[
                        source_textbox,
                        text_lang_dropdown,
                        sign_lang_dropdown,
                        output_format_dropdown,
                    ],
                    outputs=[output_video],
                    api_name="translate",
                )
    # Sample rows: [text, text language, sign language, output format].
    gradio.Examples(
        [
            ["We are here to use this.", "english", "pakistan-sign-language", "video"],
            ["i(me) admire art.", "english", "pakistan-sign-language", "landmarks"],
            ["یہ بہت اچھا ہے۔", "urdu", "pakistan-sign-language", "video"],
            ["وہ کام آسان تھا۔", "urdu", "pakistan-sign-language", "landmarks"],
            ["कैसे हैं आप?", "hindi", "pakistan-sign-language", "video"],
            ["पाँच घंटे।", "hindi", "pakistan-sign-language", "landmarks"],
        ],
        inputs=[
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            output_format_dropdown,
        ],
        outputs=output_video,
    )
    # Five components, in the same order as the five-item `log` list that
    # translate() passes to request_logger.flag().
    request_logger.setup(
        [
            source_textbox,
            text_lang_dropdown,
            sign_lang_dropdown,
            gradio.Markdown(label="Exception"),
            gradio.Markdown(label="Timestamp"),
        ],
        "flagged",
    )
    # On page load, run updateTextareaDir once with the dropdown's current value.
    gradio_app.load(None, inputs=[text_lang_dropdown], js="updateTextareaDir")
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    gradio_app.launch()