iSushant committed
Commit e039b71 · verified · 1 Parent(s): 9963dcc

Update app.py

Files changed (1)
  1. app.py +39 -49
app.py CHANGED
@@ -3,70 +3,60 @@ import google.generativeai as genai
 from PIL import Image
 import io
 from gtts import gTTS
-import base64
 
-# Configure Gemini API
+# Configure Gemini API (hardcoded as you requested)
 genai.configure(api_key="AIzaSyB6JYzYNfi8ak7g6526raHQ08YPMiC5Wic")
 
-def get_gemini_context():
-    return """
-    Focus on identifying and interpreting sign language gestures:
-    1. Look for hand shapes and finger positions
-    2. Identify any American Sign Language (ASL) letters or numbers
-    3. Recognize common ASL gestures and signs
-    4. Provide clear, direct interpretation
-    5. If unsure, describe the hand position/gesture seen
-    Keep responses brief and focused on the sign's meaning.
-    """
-
-def get_gemini_response_text(response):
-    if hasattr(response, 'prompt_feedback') and response.prompt_feedback and getattr(response.prompt_feedback, 'block_reason', None):
-        return None
-    if hasattr(response, 'candidates'):
-        for candidate in response.candidates:
-            if hasattr(candidate, 'content') and candidate.content and hasattr(candidate.content, 'parts'):
-                full_text = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
-                if full_text.strip():
-                    return full_text
-    if hasattr(response, 'text') and response.text and response.text.strip():
-        return response.text
-    return None
-
 def interpret_sign(image: Image.Image):
+    if image is None:
+        return "Please upload or capture an image.", None
+
     try:
-        # Get Gemini interpretation
+        # Prepare prompt
+        prompt = (
+            "Focus on identifying and interpreting sign language gestures. "
+            "What sign or gesture is being shown in this image? Provide a clear, concise interpretation."
+        )
+
+        # Call Gemini API
         model = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')
-        prompt = f"{get_gemini_context()}\n\nWhat sign or gesture is being shown in this image? Provide a clear, concise interpretation."
         response = model.generate_content([prompt, image])
-        text_response = get_gemini_response_text(response)
-        if not text_response:
+
+        # Extract response text
+        text = None
+        if hasattr(response, "candidates"):
+            for candidate in response.candidates:
+                if hasattr(candidate, "content") and candidate.content and hasattr(candidate.content, "parts"):
+                    parts = candidate.content.parts
+                    text = "".join(part.text for part in parts if hasattr(part, "text"))
+                    if text.strip():
+                        break
+        if not text:
+            text = getattr(response, "text", None)
+        if not text or not text.strip():
             return "Could not interpret the sign.", None
 
-        # Generate audio
-        tts = gTTS(text=text_response, lang='en', slow=False)
+        # Generate TTS audio
+        tts = gTTS(text=text, lang='en', slow=False)
         audio_fp = io.BytesIO()
         tts.write_to_fp(audio_fp)
         audio_fp.seek(0)
-        audio_bytes = audio_fp.read()
-        return text_response, (audio_bytes, "audio/mp3")
+
+        return text, (audio_fp, "audio/mp3")
     except Exception as e:
         return f"Error: {str(e)}", None
 
-with gr.Blocks(title="Sign Language Interpreter") as demo:
-    gr.Markdown("# Sign Language Interpreter\nUpload or capture an image of a sign language gesture to get an interpretation and audio.")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(label="Sign Image", sources=["upload", "webcam"], type="pil")
-            submit_btn = gr.Button("Interpret Sign")
-        with gr.Column():
-            text_output = gr.Textbox(label="Interpretation")
-            audio_output = gr.Audio(label="Audio", type="filepath")
-
-    submit_btn.click(
-        interpret_sign,
-        inputs=image_input,
-        outputs=[text_output, audio_output]
-    )
+# Gradio UI
+demo = gr.Interface(
+    fn=interpret_sign,
+    inputs=gr.Image(label="Sign Image", sources=["upload", "webcam"], type="pil"),
+    outputs=[
+        gr.Textbox(label="Interpretation"),
+        gr.Audio(label="Audio", type="filepath")
+    ],
+    title="Sign Language Interpreter",
+    description="Upload or capture an image of a sign language gesture. The app will interpret the sign and provide an audio output."
+)
 
 if __name__ == "__main__":
     demo.launch()
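
Two caveats on the new version are worth flagging. First, gr.Audio(type="filepath") expects interpret_sign to return a path to an audio file, so the (audio_fp, "audio/mp3") tuple built from an io.BytesIO buffer will not be accepted by that output component. Second, the API key is hardcoded and now published in the repo history, so it should be rotated. Below is a minimal sketch of how both could be handled; the GOOGLE_API_KEY variable name and the synthesize_to_file helper are illustrative assumptions, not part of this commit.

import os
import tempfile

import google.generativeai as genai
from gtts import gTTS

# Assumption: the key is supplied via an environment variable
# rather than being committed to the source file.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def synthesize_to_file(text: str) -> str:
    # Hypothetical helper: render gTTS speech to a temporary .mp3 and
    # return its path, the shape gr.Audio(type="filepath") expects.
    tts = gTTS(text=text, lang='en', slow=False)
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # close the raw descriptor so gTTS can write by name
    tts.save(path)
    return path

With a helper like this, the end of the try block would reduce to `return text, synthesize_to_file(text)`, and the io.BytesIO buffer could be dropped entirely.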