mgokg commited on
Commit
f32d50e
·
verified ·
1 Parent(s): c89afdc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -17
app.py CHANGED
@@ -62,34 +62,65 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
 
 
 
 
 
 
 
 
 
65
  async with client.aio.live.connect(
66
  model="gemini-2.0-flash-exp",
67
  config=config, # type: ignore
68
  ) as session:
69
  self.session = session
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  while not self.quit.is_set():
71
  turn = self.session.receive()
72
  try:
73
  async for response in turn:
74
- if data := response.data:
75
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
76
- self.audio_queue.put_nowait(audio)
 
 
77
  except websockets.exceptions.ConnectionClosedOK:
78
- print("connection closed")
79
  break
 
 
 
80
 
81
  async def video_receive(self, frame: np.ndarray):
82
  self.video_queue.put_nowait(frame)
83
 
84
  if self.session:
85
  # send image every 1 second
86
- print(time.time() - self.last_frame_time)
87
  if time.time() - self.last_frame_time > 1:
88
  self.last_frame_time = time.time()
89
  await self.session.send(input=encode_image(frame))
90
- if self.latest_args[1] is not None:
 
91
  await self.session.send(input=encode_image(self.latest_args[1]))
92
 
 
93
  async def video_emit(self):
94
  frame = await wait_for_item(self.video_queue, 0.01)
95
  if frame is not None:
@@ -108,13 +139,18 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
108
  array = await wait_for_item(self.audio_queue, 0.01)
109
  if array is not None:
110
  return (self.output_sample_rate, array)
111
- return array
112
 
113
  async def shutdown(self) -> None:
114
  if self.session:
115
  self.quit.set()
116
- await self.session.close()
117
- self.quit.clear()
 
 
 
 
 
118
 
119
 
120
  stream = Stream(
@@ -146,7 +182,7 @@ with gr.Blocks(css=css) as demo:
146
  <h1>Gen AI Voice Chat</h1>
147
  <p>real-time audio streaming</p>
148
  </center>
149
- </div>
150
  """
151
  )
152
  with gr.Row() as row:
@@ -161,14 +197,19 @@ with gr.Blocks(css=css) as demo:
161
  pulse_color="rgb(255, 255, 255)",
162
  icon_button_color="rgb(255, 255, 255)",
163
  )
164
- #with gr.Column():
165
- #image_input = gr.Image(
166
- #label="Image", type="numpy", sources=["upload", "clipboard"]
167
- #)
168
-
 
 
 
 
 
169
  webrtc.stream(
170
- GeminiHandler(),
171
- inputs=[webrtc],
172
  outputs=[webrtc],
173
  time_limit=180 if get_space() else None,
174
  concurrency_limit=2 if get_space() else None,
@@ -183,4 +224,4 @@ if __name__ == "__main__":
183
  elif mode == "PHONE":
184
  raise ValueError("Phone mode not supported for this demo")
185
  else:
186
- stream.ui.launch(server_port=7860)
 
62
  api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
  )
64
  config = {"response_modalities": ["AUDIO"]}
65
+
66
+ # --- System Message Definition ---
67
+ # You can customize this message to set the context for the AI.
68
+ system_message = (
69
+ "du bist ein echtzeitübersetzer für deutsch auf italienisch und italienisch auf deutsch. erkläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen"
70
+
71
+ )
72
+ # --- End of System Message Definition ---
73
+
74
  async with client.aio.live.connect(
75
  model="gemini-2.0-flash-exp",
76
  config=config, # type: ignore
77
  ) as session:
78
  self.session = session
79
+
80
+ # --- Send the System Message ---
81
+ if system_message:
82
+ print(f"Sending system message to Gemini: '{system_message}'")
83
+ try:
84
+ # Send the system message as the first input to the model
85
+ await self.session.send(input=system_message)
86
+ # Note: The model might provide an audio response to this system message,
87
+ # which will be handled by the loop below.
88
+ # If this initial audio response is not desired, further handling might be needed,
89
+ # or the system message phrased to not elicit a direct spoken reply.
90
+ except Exception as e:
91
+ print(f"Error sending system message: {e}")
92
+ # --- End of Sending System Message ---
93
+
94
  while not self.quit.is_set():
95
  turn = self.session.receive()
96
  try:
97
  async for response in turn:
98
+ if data := response.data: # Assumes response.data contains audio
99
  audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
100
+ self.audio_queue.put_nowait(audio)
101
+ # If you expect other types of responses (e.g., text parts) from the model,
102
+ # you would need to handle `response.text` or other fields here.
103
  except websockets.exceptions.ConnectionClosedOK:
104
+ print("Connection closed by server.")
105
  break
106
+ except Exception as e:
107
+ print(f"Error during session receive: {e}")
108
+ break # Or implement more robust error handling
109
 
110
  async def video_receive(self, frame: np.ndarray):
111
  self.video_queue.put_nowait(frame)
112
 
113
  if self.session:
114
  # send image every 1 second
115
+ # print(time.time() - self.last_frame_time) # For debugging frame send rate
116
  if time.time() - self.last_frame_time > 1:
117
  self.last_frame_time = time.time()
118
  await self.session.send(input=encode_image(frame))
119
+ # Check if additional image input is provided and send it
120
+ if len(self.latest_args) > 1 and self.latest_args[1] is not None:
121
  await self.session.send(input=encode_image(self.latest_args[1]))
122
 
123
+
124
  async def video_emit(self):
125
  frame = await wait_for_item(self.video_queue, 0.01)
126
  if frame is not None:
 
139
  array = await wait_for_item(self.audio_queue, 0.01)
140
  if array is not None:
141
  return (self.output_sample_rate, array)
142
+ return array # Returns None if no item, which is handled by Gradio
143
 
144
  async def shutdown(self) -> None:
145
  if self.session:
146
  self.quit.set()
147
+ try:
148
+ await self.session.close()
149
+ except Exception as e:
150
+ print(f"Error closing session: {e}")
151
+ finally:
152
+ self.session = None # Ensure session is cleared
153
+ self.quit.clear()
154
 
155
 
156
  stream = Stream(
 
182
  <h1>Gen AI Voice Chat</h1>
183
  <p>real-time audio streaming</p>
184
  </center>
185
+ </div>
186
  """
187
  )
188
  with gr.Row() as row:
 
197
  pulse_color="rgb(255, 255, 255)",
198
  icon_button_color="rgb(255, 255, 255)",
199
  )
200
+ # The additional_inputs in Stream handles the image input now.
201
+ # If you need a separate gr.Image component for other purposes, you can uncomment it.
202
+ # with gr.Column():
203
+ # image_input = gr.Image(
204
+ # label="Image", type="numpy", sources=["upload", "clipboard"]
205
+ # )
206
+
207
+ # The stream method now correctly uses the handler instance from the Stream object
208
+ # and correctly wires up inputs and outputs.
209
+ # The additional_inputs (like an image) are passed via self.latest_args in the handler.
210
  webrtc.stream(
211
+ stream.handler_instance, # Use the handler instance from the Stream object
212
+ inputs=[webrtc] + stream.additional_inputs_queue_proxies, # Pass webrtc and additional inputs
213
  outputs=[webrtc],
214
  time_limit=180 if get_space() else None,
215
  concurrency_limit=2 if get_space() else None,
 
224
  elif mode == "PHONE":
225
  raise ValueError("Phone mode not supported for this demo")
226
  else:
227
+ stream.ui.launch(server_port=7860)