import os
import sys
import gradio as gr
import requests
import json
from datetime import datetime
import tempfile
import uuid
import re

# Install required packages if not already installed
try:
    import mediapipe as mp
    import cv2
    import numpy as np
except ImportError:
    print("Installing required packages...")
    os.system("pip install mediapipe opencv-python numpy --quiet")
    import mediapipe as mp
    import cv2
    import numpy as np

TITLE = "Multilingual Sign Language Customer Assistant"
DESCRIPTION = """This app translates English or Arabic text into sign language videos for customer assistance.
The system automatically detects the input language and generates the corresponding sign language visuals.

**Features:**
- Supports both English and Arabic text
- Renders a simple animated avatar (3D or 2D style) to present the signs
- Designed for customer service and assistance scenarios
"""

# Initialize MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_pose = mp.solutions.pose

# Dictionary of translations for common customer service phrases
TRANSLATIONS = {
    "hello": "مرحبا",
    "welcome": "أهلا وسهلا",
    "thank you": "شكرا",
    "help": "مساعدة",
    "yes": "نعم",
    "no": "لا",
    "please": "من فضلك",
    "wait": "انتظر",
    "sorry": "آسف",
    "how can i help you": "كيف يمكنني مساعدتك",
    "customer": "عميل",
    "service": "خدمة",
    "support": "دعم",
    "information": "معلومات",
    "question": "سؤال",
    "answer": "إجابة",
}

# SignDict - dictionary of common signs in both languages
# In a production app, these would link to pre-recorded videos or 3D animations
SIGN_DICT = {
    "en": {
        "hello": "signs/en/hello.mp4",
        "welcome": "signs/en/welcome.mp4",
        "thank you": "signs/en/thank_you.mp4",
        "help": "signs/en/help.mp4",
        "yes": "signs/en/yes.mp4",
        "no": "signs/en/no.mp4",
        "please": "signs/en/please.mp4",
        "wait": "signs/en/wait.mp4",
        "sorry": "signs/en/sorry.mp4",
        "how": "signs/en/how.mp4",
        "what": "signs/en/what.mp4",
        "where": "signs/en/where.mp4",
        "when": "signs/en/when.mp4",
        "who": "signs/en/who.mp4",
        "why": "signs/en/why.mp4",
        "customer": "signs/en/customer.mp4",
        "service": "signs/en/service.mp4",
        "support": "signs/en/support.mp4",
        "information": "signs/en/information.mp4",
        "question": "signs/en/question.mp4",
        "answer": "signs/en/answer.mp4",
    },
    "ar": {
        "مرحبا": "signs/ar/hello.mp4",
        "أهلا وسهلا": "signs/ar/welcome.mp4",
        "شكرا": "signs/ar/thank_you.mp4",
        "مساعدة": "signs/ar/help.mp4",
        "نعم": "signs/ar/yes.mp4",
        "لا": "signs/ar/no.mp4",
        "من فضلك": "signs/ar/please.mp4",
        "انتظر": "signs/ar/wait.mp4",
        "آسف": "signs/ar/sorry.mp4",
        "كيف": "signs/ar/how.mp4",
        "ماذا": "signs/ar/what.mp4",
        "أين": "signs/ar/where.mp4",
        "متى": "signs/ar/when.mp4",
        "من": "signs/ar/who.mp4",
        "لماذا": "signs/ar/why.mp4",
        "عميل": "signs/ar/customer.mp4",
        "خدمة": "signs/ar/service.mp4",
        "دعم": "signs/ar/support.mp4",
        "معلومات": "signs/ar/information.mp4",
        "سؤال": "signs/ar/question.mp4",
        "إجابة": "signs/ar/answer.mp4",
    }
}

def detect_language(text):
    """Detect whether text is primarily English or Arabic"""
    if not text:
        return "unknown"

    # Simple detection by character set
    arabic_chars = set('ءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوي')
    english_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')

    arabic_count = sum(1 for char in text if char in arabic_chars)
    english_count = sum(1 for char in text if char in english_chars)

    if arabic_count > english_count:
        return "ar"
    elif english_count > 0:
        return "en"
    else:
        return "unknown"

def translate_text(text, source_lang, target_lang):
    """Simple dictionary-based translation"""
    if source_lang == target_lang:
        return text

    # Convert to lowercase for matching
    text_lower = text.lower()

    # For English to Arabic
    if source_lang == "en" and target_lang == "ar":
        for eng, ar in TRANSLATIONS.items():
            text_lower = text_lower.replace(eng, ar)
        return text_lower

    # For Arabic to English
    if source_lang == "ar" and target_lang == "en":
        for eng, ar in TRANSLATIONS.items():
            text_lower = text_lower.replace(ar, eng)
        return text_lower

    return text  # Return original if no translation path
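
# Illustrative use of the phrase-table translation above. Note that str.replace()
# matches substrings, so a key embedded in a longer word can be partially rewritten;
# a production system would use word-boundary matching or a proper MT model.
#   translate_text("thank you", "en", "ar")  -> "شكرا"
#   translate_text("مساعدة", "ar", "en")      -> "help"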

def tokenize_text(text, language):
    """Split text into tokens that can be matched to signs (greedy longest-phrase match)"""
    # Arabic is matched as-is; English is lowercased to match the dictionary keys
    tokens = text.split() if language == "ar" else text.lower().split()

    phrases = []
    i = 0
    while i < len(tokens):
        # Try to match the longest phrase first (up to 3 words)
        matched = False
        for j in range(min(3, len(tokens) - i), 0, -1):
            phrase = " ".join(tokens[i:i+j])
            if phrase in SIGN_DICT[language]:
                phrases.append(phrase)
                i += j
                matched = True
                break
        if not matched:
            phrases.append(tokens[i])
            i += 1
    return phrases
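
# Illustrative tokenization with the greedy longest-match above:
#   tokenize_text("thank you for waiting", "en")  -> ["thank you", "for", "waiting"]
#   tokenize_text("من فضلك انتظر", "ar")           -> ["من فضلك", "انتظر"]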

def generate_default_sign_video(text, output_path, language="en"):
    """Generate a simple video showing the text when no sign is available"""
    # Create black frames with centered text
    height, width = 480, 640
    fps = 30
    seconds = 2

    # Create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Text styling
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    font_color = (255, 255, 255)  # White
    line_type = 2

    # Text positioning (centered)
    text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
    text_x = (width - text_size[0]) // 2
    text_y = (height + text_size[1]) // 2

    # Write frames
    for _ in range(fps * seconds):
        frame = np.zeros((height, width, 3), dtype=np.uint8)
        cv2.putText(frame, text, (text_x, text_y), font, font_scale, font_color, line_type)
        video.write(frame)

    video.release()
    return output_path

def create_avatar_animation(text, output_path, language="en", style="3D"):
    """Create a simple avatar animation for the sign (3D or 2D style, simplified placeholder)"""
    width, height = 640, 480
    fps = 30
    duration = 3  # seconds

    # Create video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Precompute the background once; it is identical for every frame
    if style == "3D":
        # Gradient background, vectorized with numpy instead of per-pixel loops
        xv, yv = np.meshgrid(np.arange(width), np.arange(height))
        background = np.zeros((height, width, 3), dtype=np.uint8)
        background[..., 0] = (100 + 50 * (xv / width)).astype(np.uint8)
        background[..., 1] = (60 + 30 * (yv / height)).astype(np.uint8)
        background[..., 2] = (120 + 40 * ((xv + yv) / (width + height))).astype(np.uint8)
    else:
        # Simple solid light-grey background for the 2D style
        background = np.full((height, width, 3), 240, dtype=np.uint8)

    frames = fps * duration
    for i in range(frames):
        frame = background.copy()

        if style == "3D":
            # 3D-style avatar: body and head
            cv2.rectangle(frame, (width//2 - 50, height//2 - 100), (width//2 + 50, height//2 + 100), (200, 200, 200), -1)
            cv2.circle(frame, (width//2, height//2 - 150), 50, (200, 200, 200), -1)

            # Animate hands based on frame number
            t = i / frames
            # Left hand movement
            x1 = int(width//2 - 100 - 50 * np.sin(t * 2 * np.pi))
            y1 = int(height//2 - 50 * np.cos(t * 2 * np.pi))
            # Right hand movement
            x2 = int(width//2 + 100 + 50 * np.sin(t * 2 * np.pi))
            y2 = int(height//2 - 50 * np.cos(t * 2 * np.pi))
            # Draw hands
            cv2.circle(frame, (x1, y1), 20, (200, 200, 200), -1)
            cv2.circle(frame, (x2, y2), 20, (200, 200, 200), -1)
        else:
            # 2D-style signer: simplified stick figure
            cv2.line(frame, (width//2, height//2 - 100), (width//2, height//2 + 50), (0, 0, 0), 3)  # Body
            cv2.circle(frame, (width//2, height//2 - 120), 20, (0, 0, 0), 2)  # Head

            # Animated arms for signing
            t = i / frames
            angle1 = t * 2 * np.pi
            angle2 = t * 2 * np.pi + np.pi / 2
            # Left arm
            x1, y1 = width//2, height//2 - 70
            x2 = int(x1 - 60 * np.cos(angle1))
            y2 = int(y1 + 60 * np.sin(angle1))
            cv2.line(frame, (x1, y1), (x2, y2), (0, 0, 0), 2)
            # Right arm
            x3, y3 = width//2, height//2 - 70
            x4 = int(x3 + 60 * np.cos(angle2))
            y4 = int(y3 + 60 * np.sin(angle2))
            cv2.line(frame, (x3, y3), (x4, y4), (0, 0, 0), 2)

        # Caption with the current sign. Note: cv2.putText uses Hershey fonts,
        # which cannot render Arabic script; non-ASCII characters appear as '?'.
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(frame, text, (width//2 - 100, height - 50), font, 1, (0, 0, 0), 2)
        if language == "ar":
            # Right-to-left indicator
            cv2.putText(frame, "RTL", (width - 70, 30), font, 0.7, (0, 0, 0), 1)

        video.write(frame)

    video.release()
    return output_path

def generate_sign_video(tokens, language, output_format="3D"):
    """Generate a sign language video for the given tokens"""
    temp_dir = tempfile.gettempdir()
    output_path = os.path.join(temp_dir, f"sign_output_{uuid.uuid4()}.mp4")

    # In a real implementation, this would look up each token in SIGN_DICT and
    # concatenate the pre-recorded sign clips. For this demo, we render a simple
    # avatar animation for the first token only.
    if tokens:
        create_avatar_animation(tokens[0], output_path, language, output_format)
    else:
        create_avatar_animation("No tokens", output_path, language, output_format)

    return output_path

def translate_to_sign(text, output_format="3D"):
    """Main function to translate text to a sign language video"""
    if not text:
        return None, ""

    # Detect the input language
    language = detect_language(text)
    if language == "unknown":
        return None, "Could not determine the language. Please use English or Arabic."

    try:
        # Tokenize the text
        tokens = tokenize_text(text, language)
        if not tokens:
            return None, "No translatable tokens found."

        # Generate the sign language video
        video_path = generate_sign_video(tokens, language, output_format)

        # Prepare the status message
        if language == "en":
            status = f"Translated English: \"{text}\" to sign language."
        else:
            status = f"Translated Arabic: \"{text}\" to sign language."

        return video_path, status
    except Exception as e:
        error_msg = str(e)
        print(f"Error during translation: {error_msg}")
        return None, f"Error during translation: {error_msg}"

# Create the Gradio interface
with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            # Input area
            text_input = gr.Textbox(
                lines=4,
                placeholder="Enter English or Arabic text here...",
                label="Text Input"
            )
            format_dropdown = gr.Dropdown(
                choices=["3D", "2D"],
                value="3D",
                label="Avatar Style"
            )
            with gr.Row():
                clear_btn = gr.Button("Clear")
                translate_btn = gr.Button("Translate to Sign Language", variant="primary")

            # Status area
            status_output = gr.Textbox(label="Status", interactive=False)

        with gr.Column():
            # Output video
            video_output = gr.Video(
                label="Sign Language Output",
                format="mp4",
                autoplay=True,
                show_download_button=True
            )

    # Examples in both languages
    gr.Examples(
        examples=[
            ["Hello, how can I help you today?", "3D"],
            ["Please wait while I check your account.", "3D"],
            ["Thank you for your patience.", "3D"],
            ["مرحبا، كيف يمكنني مساعدتك اليوم؟", "3D"],
            ["من فضلك انتظر بينما أتحقق من حسابك.", "3D"],
            ["شكرا لصبرك.", "3D"]
        ],
        inputs=[text_input, format_dropdown],
        outputs=[video_output, status_output],
        fn=translate_to_sign
    )

    # Event handlers
    translate_btn.click(
        fn=translate_to_sign,
        inputs=[text_input, format_dropdown],
        outputs=[video_output, status_output]
    )
    clear_btn.click(
        fn=lambda: ("", "Input cleared"),
        inputs=None,
        outputs=[text_input, status_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()