iSushant committed
Commit e039b71 · verified · 1 Parent(s): 9963dcc

Update app.py

Files changed (1)
  1. app.py +39 -49
app.py CHANGED
@@ -3,70 +3,60 @@ import google.generativeai as genai
 from PIL import Image
 import io
 from gtts import gTTS
-import base64
 
-# Configure Gemini API
+# Configure Gemini API (hardcoded as you requested)
 genai.configure(api_key="AIzaSyB6JYzYNfi8ak7g6526raHQ08YPMiC5Wic")
 
-def get_gemini_context():
-    return """
-    Focus on identifying and interpreting sign language gestures:
-    1. Look for hand shapes and finger positions
-    2. Identify any American Sign Language (ASL) letters or numbers
-    3. Recognize common ASL gestures and signs
-    4. Provide clear, direct interpretation
-    5. If unsure, describe the hand position/gesture seen
-    Keep responses brief and focused on the sign's meaning.
-    """
-
-def get_gemini_response_text(response):
-    if hasattr(response, 'prompt_feedback') and response.prompt_feedback and getattr(response.prompt_feedback, 'block_reason', None):
-        return None
-    if hasattr(response, 'candidates'):
-        for candidate in response.candidates:
-            if hasattr(candidate, 'content') and candidate.content and hasattr(candidate.content, 'parts'):
-                full_text = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
-                if full_text.strip():
-                    return full_text
-    if hasattr(response, 'text') and response.text and response.text.strip():
-        return response.text
-    return None
-
 def interpret_sign(image: Image.Image):
+    if image is None:
+        return "Please upload or capture an image.", None
+
     try:
-        # Get Gemini interpretation
+        # Prepare prompt
+        prompt = (
+            "Focus on identifying and interpreting sign language gestures. "
+            "What sign or gesture is being shown in this image? Provide a clear, concise interpretation."
+        )
+
+        # Call Gemini API
         model = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')
-        prompt = f"{get_gemini_context()}\n\nWhat sign or gesture is being shown in this image? Provide a clear, concise interpretation."
         response = model.generate_content([prompt, image])
-        text_response = get_gemini_response_text(response)
-        if not text_response:
+
+        # Extract response text
+        text = None
+        if hasattr(response, "candidates"):
+            for candidate in response.candidates:
+                if hasattr(candidate, "content") and candidate.content and hasattr(candidate.content, "parts"):
+                    parts = candidate.content.parts
+                    text = "".join(part.text for part in parts if hasattr(part, "text"))
+                    if text.strip():
+                        break
+        if not text:
+            text = getattr(response, "text", None)
+        if not text or not text.strip():
             return "Could not interpret the sign.", None
 
-        # Generate audio
-        tts = gTTS(text=text_response, lang='en', slow=False)
+        # Generate TTS audio
+        tts = gTTS(text=text, lang='en', slow=False)
         audio_fp = io.BytesIO()
         tts.write_to_fp(audio_fp)
         audio_fp.seek(0)
-        audio_bytes = audio_fp.read()
-        return text_response, (audio_bytes, "audio/mp3")
+
+        return text, (audio_fp, "audio/mp3")
     except Exception as e:
         return f"Error: {str(e)}", None
 
-with gr.Blocks(title="Sign Language Interpreter") as demo:
-    gr.Markdown("# Sign Language Interpreter\nUpload or capture an image of a sign language gesture to get an interpretation and audio.")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(label="Sign Image", sources=["upload", "webcam"], type="pil")
-            submit_btn = gr.Button("Interpret Sign")
-        with gr.Column():
-            text_output = gr.Textbox(label="Interpretation")
-            audio_output = gr.Audio(label="Audio", type="filepath")
-
-    submit_btn.click(
-        interpret_sign,
-        inputs=image_input,
-        outputs=[text_output, audio_output]
-    )
+# Gradio UI
+demo = gr.Interface(
+    fn=interpret_sign,
+    inputs=gr.Image(label="Sign Image", sources=["upload", "webcam"], type="pil"),
+    outputs=[
+        gr.Textbox(label="Interpretation"),
+        gr.Audio(label="Audio", type="filepath")
+    ],
+    title="Sign Language Interpreter",
+    description="Upload or capture an image of a sign language gesture. The app will interpret the sign and provide an audio output."
+)
 
 if __name__ == "__main__":
     demo.launch()
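
Two caveats on the new version are worth flagging. First, gr.Audio(type="filepath") expects interpret_sign to return a path to an audio file, so the (audio_fp, "audio/mp3") tuple built from an io.BytesIO buffer will not be accepted by that output component. Second, the API key is hardcoded and now published in the repo history, so it should be rotated. Below is a minimal sketch of how both could be handled; the GOOGLE_API_KEY variable name and the synthesize_to_file helper are illustrative assumptions, not part of this commit.

import os
import tempfile

import google.generativeai as genai
from gtts import gTTS

# Assumption: the key is supplied via an environment variable
# rather than being committed to the source file.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def synthesize_to_file(text: str) -> str:
    # Hypothetical helper: render gTTS speech to a temporary .mp3 and
    # return its path, the shape gr.Audio(type="filepath") expects.
    tts = gTTS(text=text, lang='en', slow=False)
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # close the raw descriptor so gTTS can write by name
    tts.save(path)
    return path

With a helper like this, the end of the try block would reduce to `return text, synthesize_to_file(text)`, and the io.BytesIO buffer could be dropped entirely.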