mgokg committed on
Commit
bbf9a20
·
verified ·
1 Parent(s): f32d50e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -58
app.py CHANGED
@@ -62,65 +62,34 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
65
-
66
- # --- System Message Definition ---
67
- # You can customize this message to set the context for the AI.
68
- system_message = (
69
- "du bist ein echtzeitübersetzer für deutsch auf italienisch und italienisch auf deutsch. erläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen"
70
-
71
- )
72
- # --- End of System Message Definition ---
73
-
74
  async with client.aio.live.connect(
75
  model="gemini-2.0-flash-exp",
76
  config=config, # type: ignore
77
  ) as session:
78
  self.session = session
79
-
80
- # --- Send the System Message ---
81
- if system_message:
82
- print(f"Sending system message to Gemini: '{system_message}'")
83
- try:
84
- # Send the system message as the first input to the model
85
- await self.session.send(input=system_message)
86
- # Note: The model might provide an audio response to this system message,
87
- # which will be handled by the loop below.
88
- # If this initial audio response is not desired, further handling might be needed,
89
- # or the system message phrased to not elicit a direct spoken reply.
90
- except Exception as e:
91
- print(f"Error sending system message: {e}")
92
- # --- End of Sending System Message ---
93
-
94
  while not self.quit.is_set():
95
  turn = self.session.receive()
96
  try:
97
  async for response in turn:
98
- if data := response.data: # Assumes response.data contains audio
99
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
100
- self.audio_queue.put_nowait(audio)
101
- # If you expect other types of responses (e.g., text parts) from the model,
102
- # you would need to handle `response.text` or other fields here.
103
  except websockets.exceptions.ConnectionClosedOK:
104
- print("Connection closed by server.")
105
  break
106
- except Exception as e:
107
- print(f"Error during session receive: {e}")
108
- break # Or implement more robust error handling
109
 
110
  async def video_receive(self, frame: np.ndarray):
111
  self.video_queue.put_nowait(frame)
112
 
113
  if self.session:
114
  # send image every 1 second
115
- # print(time.time() - self.last_frame_time) # For debugging frame send rate
116
  if time.time() - self.last_frame_time > 1:
117
  self.last_frame_time = time.time()
118
  await self.session.send(input=encode_image(frame))
119
- # Check if additional image input is provided and send it
120
- if len(self.latest_args) > 1 and self.latest_args[1] is not None:
121
  await self.session.send(input=encode_image(self.latest_args[1]))
122
 
123
-
124
  async def video_emit(self):
125
  frame = await wait_for_item(self.video_queue, 0.01)
126
  if frame is not None:
@@ -139,18 +108,13 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
139
  array = await wait_for_item(self.audio_queue, 0.01)
140
  if array is not None:
141
  return (self.output_sample_rate, array)
142
- return array # Returns None if no item, which is handled by Gradio
143
 
144
  async def shutdown(self) -> None:
145
  if self.session:
146
  self.quit.set()
147
- try:
148
- await self.session.close()
149
- except Exception as e:
150
- print(f"Error closing session: {e}")
151
- finally:
152
- self.session = None # Ensure session is cleared
153
- self.quit.clear()
154
 
155
 
156
  stream = Stream(
@@ -182,7 +146,7 @@ with gr.Blocks(css=css) as demo:
182
  <h1>Gen AI Voice Chat</h1>
183
  <p>real-time audio streaming</p>
184
  </center>
185
- </div>
186
  """
187
  )
188
  with gr.Row() as row:
@@ -197,19 +161,14 @@ with gr.Blocks(css=css) as demo:
197
  pulse_color="rgb(255, 255, 255)",
198
  icon_button_color="rgb(255, 255, 255)",
199
  )
200
- # The additional_inputs in Stream handles the image input now.
201
- # If you need a separate gr.Image component for other purposes, you can uncomment it.
202
- # with gr.Column():
203
- # image_input = gr.Image(
204
- # label="Image", type="numpy", sources=["upload", "clipboard"]
205
- # )
206
-
207
- # The stream method now correctly uses the handler instance from the Stream object
208
- # and correctly wires up inputs and outputs.
209
- # The additional_inputs (like an image) are passed via self.latest_args in the handler.
210
  webrtc.stream(
211
- stream.handler_instance, # Use the handler instance from the Stream object
212
- inputs=[webrtc] + stream.additional_inputs_queue_proxies, # Pass webrtc and additional inputs
213
  outputs=[webrtc],
214
  time_limit=180 if get_space() else None,
215
  concurrency_limit=2 if get_space() else None,
@@ -224,4 +183,4 @@ if __name__ == "__main__":
224
  elif mode == "PHONE":
225
  raise ValueError("Phone mode not supported for this demo")
226
  else:
227
- stream.ui.launch(server_port=7860)
 
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
 
 
 
 
 
 
 
 
 
65
  async with client.aio.live.connect(
66
  model="gemini-2.0-flash-exp",
67
  config=config, # type: ignore
68
  ) as session:
69
  self.session = session
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  while not self.quit.is_set():
71
  turn = self.session.receive()
72
  try:
73
  async for response in turn:
74
+ if data := response.data:
75
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
76
+ self.audio_queue.put_nowait(audio)
 
 
77
  except websockets.exceptions.ConnectionClosedOK:
78
+ print("connection closed")
79
  break
 
 
 
80
 
81
  async def video_receive(self, frame: np.ndarray):
82
  self.video_queue.put_nowait(frame)
83
 
84
  if self.session:
85
  # send image every 1 second
86
+ print(time.time() - self.last_frame_time)
87
  if time.time() - self.last_frame_time > 1:
88
  self.last_frame_time = time.time()
89
  await self.session.send(input=encode_image(frame))
90
+ if self.latest_args[1] is not None:
 
91
  await self.session.send(input=encode_image(self.latest_args[1]))
92
 
 
93
  async def video_emit(self):
94
  frame = await wait_for_item(self.video_queue, 0.01)
95
  if frame is not None:
 
108
  array = await wait_for_item(self.audio_queue, 0.01)
109
  if array is not None:
110
  return (self.output_sample_rate, array)
111
+ return array
112
 
113
  async def shutdown(self) -> None:
114
  if self.session:
115
  self.quit.set()
116
+ await self.session.close()
117
+ self.quit.clear()
 
 
 
 
 
118
 
119
 
120
  stream = Stream(
 
146
  <h1>Gen AI Voice Chat</h1>
147
  <p>real-time audio streaming</p>
148
  </center>
149
+ </div>
150
  """
151
  )
152
  with gr.Row() as row:
 
161
  pulse_color="rgb(255, 255, 255)",
162
  icon_button_color="rgb(255, 255, 255)",
163
  )
164
+ #with gr.Column():
165
+ #image_input = gr.Image(
166
+ #label="Image", type="numpy", sources=["upload", "clipboard"]
167
+ #)
168
+
 
 
 
 
 
169
  webrtc.stream(
170
+ GeminiHandler(),
171
+ inputs=[webrtc],
172
  outputs=[webrtc],
173
  time_limit=180 if get_space() else None,
174
  concurrency_limit=2 if get_space() else None,
 
183
  elif mode == "PHONE":
184
  raise ValueError("Phone mode not supported for this demo")
185
  else:
186
+ stream.ui.launch(server_port=7860)