mgokg commited on
Commit
f32d50e
·
verified ·
1 Parent(s): c89afdc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -17
app.py CHANGED
@@ -62,34 +62,65 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
 
 
 
 
 
 
 
 
 
65
  async with client.aio.live.connect(
66
  model="gemini-2.0-flash-exp",
67
  config=config, # type: ignore
68
  ) as session:
69
  self.session = session
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  while not self.quit.is_set():
71
  turn = self.session.receive()
72
  try:
73
  async for response in turn:
74
- if data := response.data:
75
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
76
- self.audio_queue.put_nowait(audio)
 
 
77
  except websockets.exceptions.ConnectionClosedOK:
78
- print("connection closed")
79
  break
 
 
 
80
 
81
  async def video_receive(self, frame: np.ndarray):
82
  self.video_queue.put_nowait(frame)
83
 
84
  if self.session:
85
  # send image every 1 second
86
- print(time.time() - self.last_frame_time)
87
  if time.time() - self.last_frame_time > 1:
88
  self.last_frame_time = time.time()
89
  await self.session.send(input=encode_image(frame))
90
- if self.latest_args[1] is not None:
 
91
  await self.session.send(input=encode_image(self.latest_args[1]))
92
 
 
93
  async def video_emit(self):
94
  frame = await wait_for_item(self.video_queue, 0.01)
95
  if frame is not None:
@@ -108,13 +139,18 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
108
  array = await wait_for_item(self.audio_queue, 0.01)
109
  if array is not None:
110
  return (self.output_sample_rate, array)
111
- return array
112
 
113
  async def shutdown(self) -> None:
114
  if self.session:
115
  self.quit.set()
116
- await self.session.close()
117
- self.quit.clear()
 
 
 
 
 
118
 
119
 
120
  stream = Stream(
@@ -146,7 +182,7 @@ with gr.Blocks(css=css) as demo:
146
  <h1>Gen AI Voice Chat</h1>
147
  <p>real-time audio streaming</p>
148
  </center>
149
- </div>
150
  """
151
  )
152
  with gr.Row() as row:
@@ -161,14 +197,19 @@ with gr.Blocks(css=css) as demo:
161
  pulse_color="rgb(255, 255, 255)",
162
  icon_button_color="rgb(255, 255, 255)",
163
  )
164
- #with gr.Column():
165
- #image_input = gr.Image(
166
- #label="Image", type="numpy", sources=["upload", "clipboard"]
167
- #)
168
-
 
 
 
 
 
169
  webrtc.stream(
170
- GeminiHandler(),
171
- inputs=[webrtc],
172
  outputs=[webrtc],
173
  time_limit=180 if get_space() else None,
174
  concurrency_limit=2 if get_space() else None,
@@ -183,4 +224,4 @@ if __name__ == "__main__":
183
  elif mode == "PHONE":
184
  raise ValueError("Phone mode not supported for this demo")
185
  else:
186
- stream.ui.launch(server_port=7860)
 
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
65
+
66
+ # --- System Message Definition ---
67
+ # You can customize this message to set the context for the AI.
68
+ system_message = (
69
+ "du bist ein echtzeitübersetzer für deutsch auf italienisch und italienisch auf deutsch. erkläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen"
70
+
71
+ )
72
+ # --- End of System Message Definition ---
73
+
74
  async with client.aio.live.connect(
75
  model="gemini-2.0-flash-exp",
76
  config=config, # type: ignore
77
  ) as session:
78
  self.session = session
79
+
80
+ # --- Send the System Message ---
81
+ if system_message:
82
+ print(f"Sending system message to Gemini: '{system_message}'")
83
+ try:
84
+ # Send the system message as the first input to the model
85
+ await self.session.send(input=system_message)
86
+ # Note: The model might provide an audio response to this system message,
87
+ # which will be handled by the loop below.
88
+ # If this initial audio response is not desired, further handling might be needed,
89
+ # or the system message phrased to not elicit a direct spoken reply.
90
+ except Exception as e:
91
+ print(f"Error sending system message: {e}")
92
+ # --- End of Sending System Message ---
93
+
94
  while not self.quit.is_set():
95
  turn = self.session.receive()
96
  try:
97
  async for response in turn:
98
+ if data := response.data: # Assumes response.data contains audio
99
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
100
+ self.audio_queue.put_nowait(audio)
101
+ # If you expect other types of responses (e.g., text parts) from the model,
102
+ # you would need to handle `response.text` or other fields here.
103
  except websockets.exceptions.ConnectionClosedOK:
104
+ print("Connection closed by server.")
105
  break
106
+ except Exception as e:
107
+ print(f"Error during session receive: {e}")
108
+ break # Or implement more robust error handling
109
 
110
  async def video_receive(self, frame: np.ndarray):
111
  self.video_queue.put_nowait(frame)
112
 
113
  if self.session:
114
  # send image every 1 second
115
+ # print(time.time() - self.last_frame_time) # For debugging frame send rate
116
  if time.time() - self.last_frame_time > 1:
117
  self.last_frame_time = time.time()
118
  await self.session.send(input=encode_image(frame))
119
+ # Check if additional image input is provided and send it
120
+ if len(self.latest_args) > 1 and self.latest_args[1] is not None:
121
  await self.session.send(input=encode_image(self.latest_args[1]))
122
 
123
+
124
  async def video_emit(self):
125
  frame = await wait_for_item(self.video_queue, 0.01)
126
  if frame is not None:
 
139
  array = await wait_for_item(self.audio_queue, 0.01)
140
  if array is not None:
141
  return (self.output_sample_rate, array)
142
+ return array # Returns None if no item, which is handled by Gradio
143
 
144
  async def shutdown(self) -> None:
145
  if self.session:
146
  self.quit.set()
147
+ try:
148
+ await self.session.close()
149
+ except Exception as e:
150
+ print(f"Error closing session: {e}")
151
+ finally:
152
+ self.session = None # Ensure session is cleared
153
+ self.quit.clear()
154
 
155
 
156
  stream = Stream(
 
182
  <h1>Gen AI Voice Chat</h1>
183
  <p>real-time audio streaming</p>
184
  </center>
185
+ </div>
186
  """
187
  )
188
  with gr.Row() as row:
 
197
  pulse_color="rgb(255, 255, 255)",
198
  icon_button_color="rgb(255, 255, 255)",
199
  )
200
+ # The additional_inputs in Stream handles the image input now.
201
+ # If you need a separate gr.Image component for other purposes, you can uncomment it.
202
+ # with gr.Column():
203
+ # image_input = gr.Image(
204
+ # label="Image", type="numpy", sources=["upload", "clipboard"]
205
+ # )
206
+
207
+ # The stream method now correctly uses the handler instance from the Stream object
208
+ # and correctly wires up inputs and outputs.
209
+ # The additional_inputs (like an image) are passed via self.latest_args in the handler.
210
  webrtc.stream(
211
+ stream.handler_instance, # Use the handler instance from the Stream object
212
+ inputs=[webrtc] + stream.additional_inputs_queue_proxies, # Pass webrtc and additional inputs
213
  outputs=[webrtc],
214
  time_limit=180 if get_space() else None,
215
  concurrency_limit=2 if get_space() else None,
 
224
  elif mode == "PHONE":
225
  raise ValueError("Phone mode not supported for this demo")
226
  else:
227
+ stream.ui.launch(server_port=7860)