gemini-webrtc

Running

App Files Community

mgokg committed on May 15

Commit

bbf9a20

verified ·

1 Parent(s): f32d50e

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -58

app.py CHANGED Viewed

@@ -62,65 +62,34 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
         )
         config = {"response_modalities": ["AUDIO"]}
-        # --- System Message Definition ---
-        # You can customize this message to set the context for the AI.
-        system_message = (
-            "du bist ein echtzeitübersetzer für deutsch auf italienisch und italienisch auf deutsch. erläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen"
-        )
-        # --- End of System Message Definition ---
         async with client.aio.live.connect(
             model="gemini-2.0-flash-exp",
             config=config,  # type: ignore
         ) as session:
             self.session = session
-            # --- Send the System Message ---
-            if system_message:
-                print(f"Sending system message to Gemini: '{system_message}'")
-                try:
-                    # Send the system message as the first input to the model
-                    await self.session.send(input=system_message)
-                    # Note: The model might provide an audio response to this system message,
-                    # which will be handled by the loop below.
-                    # If this initial audio response is not desired, further handling might be needed,
-                    # or the system message phrased to not elicit a direct spoken reply.
-                except Exception as e:
-                    print(f"Error sending system message: {e}")
-            # --- End of Sending System Message ---
             while not self.quit.is_set():
                 turn = self.session.receive()
                 try:
                     async for response in turn:
-                        if data := response.data:  # Assumes response.data contains audio
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                            self.audio_queue.put_nowait(audio)
-                        # If you expect other types of responses (e.g., text parts) from the model,
-                        # you would need to handle `response.text` or other fields here.
                 except websockets.exceptions.ConnectionClosedOK:
-                    print("Connection closed by server.")
                     break
-                except Exception as e:
-                    print(f"Error during session receive: {e}")
-                    break # Or implement more robust error handling
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
         if self.session:
             # send image every 1 second
-            # print(time.time() - self.last_frame_time) # For debugging frame send rate
             if time.time() - self.last_frame_time > 1:
                 self.last_frame_time = time.time()
                 await self.session.send(input=encode_image(frame))
-                # Check if additional image input is provided and send it
-                if len(self.latest_args) > 1 and self.latest_args[1] is not None:
                     await self.session.send(input=encode_image(self.latest_args[1]))
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         if frame is not None:
@@ -139,18 +108,13 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
-        return array # Returns None if no item, which is handled by Gradio
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
-            try:
-                await self.session.close()
-            except Exception as e:
-                print(f"Error closing session: {e}")
-            finally:
-                self.session = None # Ensure session is cleared
-                self.quit.clear()
 stream = Stream(
@@ -182,7 +146,7 @@ with gr.Blocks(css=css) as demo:
             <h1>Gen AI Voice Chat</h1>
             <p>real-time audio streaming</p>
           </center>
-        </div>
     """
     )
     with gr.Row() as row:
@@ -197,19 +161,14 @@ with gr.Blocks(css=css) as demo:
                 pulse_color="rgb(255, 255, 255)",
                 icon_button_color="rgb(255, 255, 255)",
             )
-        # The additional_inputs in Stream handles the image input now.
-        # If you need a separate gr.Image component for other purposes, you can uncomment it.
-        # with gr.Column():
-            # image_input = gr.Image(
-                # label="Image", type="numpy", sources=["upload", "clipboard"]
-            # )
-        # The stream method now correctly uses the handler instance from the Stream object
-        # and correctly wires up inputs and outputs.
-        # The additional_inputs (like an image) are passed via self.latest_args in the handler.
         webrtc.stream(
-            stream.handler_instance, # Use the handler instance from the Stream object
-            inputs=[webrtc] + stream.additional_inputs_queue_proxies, # Pass webrtc and additional inputs
             outputs=[webrtc],
             time_limit=180 if get_space() else None,
             concurrency_limit=2 if get_space() else None,
@@ -224,4 +183,4 @@ if __name__ == "__main__":
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported for this demo")
     else:
-        stream.ui.launch(server_port=7860)

             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
         )
         config = {"response_modalities": ["AUDIO"]}
         async with client.aio.live.connect(
             model="gemini-2.0-flash-exp",
             config=config,  # type: ignore
         ) as session:
             self.session = session
             while not self.quit.is_set():
                 turn = self.session.receive()
                 try:
                     async for response in turn:
+                        if data := response.data:
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                        self.audio_queue.put_nowait(audio)
                 except websockets.exceptions.ConnectionClosedOK:
+                    print("connection closed")
                     break
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
         if self.session:
             # send image every 1 second
+            print(time.time() - self.last_frame_time)
             if time.time() - self.last_frame_time > 1:
                 self.last_frame_time = time.time()
                 await self.session.send(input=encode_image(frame))
+                if self.latest_args[1] is not None:
                     await self.session.send(input=encode_image(self.latest_args[1]))
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         if frame is not None:
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
+        return array
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
+            await self.session.close()
+            self.quit.clear()
 stream = Stream(
             <h1>Gen AI Voice Chat</h1>
             <p>real-time audio streaming</p>
           </center>
+        </div>
     """
     )
     with gr.Row() as row:
                 pulse_color="rgb(255, 255, 255)",
                 icon_button_color="rgb(255, 255, 255)",
             )
+        #with gr.Column():
+            #image_input = gr.Image(
+                #label="Image", type="numpy", sources=["upload", "clipboard"]
+            #)
         webrtc.stream(
+            GeminiHandler(),
+            inputs=[webrtc],
             outputs=[webrtc],
             time_limit=180 if get_space() else None,
             concurrency_limit=2 if get_space() else None,
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported for this demo")
     else:
+        stream.ui.launch(server_port=7860)