mgokg commited on
Commit
eb44df9
·
verified ·
1 Parent(s): c1bc211

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -217
app.py CHANGED
@@ -1,241 +1,262 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright 2025 Google LLC
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """
16
- ## Setup
17
-
18
- The gradio-webrtc install fails unless you have ffmpeg@6, on mac:
19
-
20
- ```
21
- brew uninstall ffmpeg
22
- brew install ffmpeg@6
23
- brew link ffmpeg@6
24
- ```
25
-
26
- Create a virtual python environment, then install the dependencies for this script:
27
-
28
- ```
29
- pip install websockets numpy gradio-webrtc "gradio>=5.9.1"
30
- ```
31
-
32
- If installation fails it may be due to a missing or incompatible ffmpeg version (see the note above).
33
-
34
- Before running this script, ensure the `GOOGLE_API_KEY` environment
35
-
36
- ```
37
- $ export GOOGLE_API_KEY='add your key here'
38
- ```
39
-
40
- You can get an api-key from Google AI Studio (https://aistudio.google.com/apikey)
41
-
42
- ## Run
43
-
44
- To run the script:
45
-
46
- ```
47
- python gemini_gradio_audio.py
48
- ```
49
-
50
- On the gradio page (http://127.0.0.1:7860/) click record, and talk, gemini will reply. But note that interruptions
51
- don't work.
52
-
53
- """
54
-
55
  import os
56
  import base64
57
  import json
58
  import numpy as np
59
  import gradio as gr
60
- import websockets.sync.client
61
- from gradio_webrtc import StreamHandler, WebRTC
 
 
 
 
 
 
 
 
62
 
63
  __version__ = "0.0.3"
64
 
65
- #KEY_NAME="<REDACTED-API-KEY>"  # SECURITY: a live API key was committed here; revoke it and use environment variables instead.
 
66
 
67
  # Configuration and Utilities
68
  class GeminiConfig:
69
  """Configuration settings for Gemini API."""
 
70
  def __init__(self):
71
- self.api_key = os.environ.get(KEY_NAME)
 
 
72
  self.host = "generativelanguage.googleapis.com"
73
  self.model = "models/gemini-2.0-flash-exp"
74
- self.ws_url = f"wss://{self.host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
 
 
75
 
76
  class AudioProcessor:
77
  """Handles encoding and decoding of audio data."""
 
78
  @staticmethod
79
- def encode_audio(data, sample_rate):
80
  """Encodes audio data to base64."""
 
 
 
81
  encoded = base64.b64encode(data.tobytes()).decode("UTF-8")
82
- return {
83
- "realtimeInput": {
84
- "mediaChunks": [
85
- {
86
- "mimeType": f"audio/pcm;rate={sample_rate}",
87
- "data": encoded,
88
- }
89
- ],
90
- },
91
- }
92
 
93
  @staticmethod
94
- def process_audio_response(data):
95
  """Decodes audio data from base64."""
96
  audio_data = base64.b64decode(data)
97
  return np.frombuffer(audio_data, dtype=np.int16)
98
 
99
- # Gemini Interaction Handler
100
- class GeminiHandler(StreamHandler):
101
- """Handles streaming interactions with the Gemini API."""
102
- def __init__(self, expected_layout="mono", output_sample_rate=24000, output_frame_size=480) -> None:
103
- super().__init__(expected_layout, output_sample_rate, output_frame_size, input_sample_rate=24000)
104
- self.config = GeminiConfig()
105
- self.ws = None
106
- self.all_output_data = None
107
- self.audio_processor = AudioProcessor()
108
 
109
- def copy(self):
110
- """Creates a copy of the GeminiHandler instance."""
111
- return GeminiHandler(
112
- expected_layout=self.expected_layout,
113
- output_sample_rate=self.output_sample_rate,
114
- output_frame_size=self.output_frame_size,
115
- )
116
-
117
- def _initialize_websocket(self):
118
- """Initializes the WebSocket connection to the Gemini API."""
119
- try:
120
- self.ws = websockets.sync.client.connect(self.config.ws_url, timeout=3000)
121
- initial_request = {"setup": {"model": self.config.model,"tools":[{"google_search": {}}]}}
122
- self.ws.send(json.dumps(initial_request))
123
- setup_response = json.loads(self.ws.recv())
124
- print(f"Setup response: {setup_response}")
125
- except websockets.exceptions.WebSocketException as e:
126
- print(f"WebSocket connection failed: {str(e)}")
127
- self.ws = None
128
- except Exception as e:
129
- print(f"Setup failed: {str(e)}")
130
- self.ws = None
131
-
132
- def receive(self, frame: tuple[int, np.ndarray]) -> None:
133
- """Empfängt Audio-/Videodaten, kodiert sie und sendet sie an die Gemini API."""
134
- try:
135
- if not self.ws:
136
- self._initialize_websocket()
137
- if not self.ws: # Überprüfen, ob die Verbindung erfolgreich ist
138
- print("WebSocket-Verbindung konnte nicht hergestellt werden.")
139
- return # Frühzeitiger Rückkehr, wenn die Verbindung fehlschlägt
140
-
141
- sample_rate, array = frame
142
- message = {"realtimeInput": {"mediaChunks": []}}
143
-
144
- if sample_rate > 0 and array is not None:
145
- array = array.squeeze()
146
- audio_data = self.audio_processor.encode_audio(array, self.output_sample_rate)
147
- message["realtimeInput"]["mediaChunks"].append({
148
- "mimeType": f"audio/pcm;rate={self.output_sample_rate}",
149
- "data": audio_data["realtimeInput"]["mediaChunks"][0]["data"],
150
- })
151
-
152
- if message["realtimeInput"]["mediaChunks"]:
153
- self.ws.send(json.dumps(message))
154
- except Exception as e:
155
- print(f"Fehler beim Empfangen: {str(e)}")
156
- if self.ws:
157
- self.ws.close()
158
- self.ws = None
159
-
160
-
161
- def _process_server_content(self, content):
162
- """Processes audio output data from the WebSocket response."""
163
- for part in content.get("parts", []):
164
- data = part.get("inlineData", {}).get("data", "")
165
- if data:
166
- audio_array = self.audio_processor.process_audio_response(data)
167
- if self.all_output_data is None:
168
- self.all_output_data = audio_array
169
- else:
170
- self.all_output_data = np.concatenate((self.all_output_data, audio_array))
171
-
172
- while self.all_output_data.shape[-1] >= self.output_frame_size:
173
- yield (self.output_sample_rate, self.all_output_data[: self.output_frame_size].reshape(1, -1))
174
- self.all_output_data = self.all_output_data[self.output_frame_size :]
175
-
176
- def generator(self):
177
- """Generates audio output from the WebSocket stream."""
178
- while True:
179
- if not self.ws:
180
- print("WebSocket not connected")
181
- yield None
182
- continue
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  try:
185
- message = self.ws.recv(timeout=30)
186
- msg = json.loads(message)
187
- if "serverContent" in msg:
188
- content = msg["serverContent"].get("modelTurn", {})
189
- yield from self._process_server_content(content)
190
- except TimeoutError:
191
- print("Timeout waiting for server response")
192
- yield None
193
  except Exception as e:
194
- yield None
195
-
196
- def emit(self) -> tuple[int, np.ndarray] | None:
197
- """Emits the next audio chunk from the generator."""
198
- if not self.ws:
199
- return None
200
- if not hasattr(self, "_generator"):
201
- self._generator = self.generator()
202
- try:
203
- return next(self._generator)
204
- except StopIteration:
205
- self.reset()
206
- return None
207
-
208
- def reset(self) -> None:
209
- """Resets the generator and output data."""
210
- if hasattr(self, "_generator"):
211
- delattr(self, "_generator")
212
- self.all_output_data = None
213
 
214
- def shutdown(self) -> None:
215
- """Closes the WebSocket connection."""
216
- if self.ws:
217
- self.ws.close()
218
-
219
- def check_connection(self):
220
- """Checks if the WebSocket connection is active."""
221
- try:
222
- if not self.ws or self.ws.closed:
223
- self._initialize_websocket()
224
- return True
225
- except Exception as e:
226
- print(f"Connection check failed: {str(e)}")
227
- return False
228
-
229
- # Main Gradio Interface
230
- def registry(
231
- name: str,
232
- token: str | None = None,
233
- **kwargs
234
  ):
235
  """Sets up and returns the Gradio interface."""
236
- api_key = token or os.environ.get(KEY_NAME)
237
- if not api_key:
238
- raise ValueError(f"{KEY_NAME} environment variable is not set.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  interface = gr.Blocks()
241
  with interface:
@@ -248,21 +269,32 @@ def registry(
248
  </div>
249
  """
250
  )
251
- gemini_handler = GeminiHandler()
252
  with gr.Row():
253
- audio = WebRTC(label="Voice Chat", modality="audio", mode="send-receive")
254
-
255
- audio.stream(
256
- gemini_handler,
257
- inputs=[audio],
258
- outputs=[audio],
259
- time_limit=600,
260
- concurrency_limit=10
 
 
 
 
 
 
 
 
261
  )
 
262
  return interface
263
 
264
  # Launch the Gradio interface
265
- gr.load(
266
- name='gemini-2.0-flash-exp',
267
- src=registry,
268
- ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import base64
3
  import json
4
  import numpy as np
5
  import gradio as gr
6
+ # import websockets.sync.client # No longer needed with FastRTC
7
+ from fastrtc import (
8
+ PeerConnection,
9
+ DataChannel,
10
+ MediaStreamTrack,
11
+ AudioFrame,
12
+ VideoFrame,
13
+ ) # Import FastRTC components
14
+ from aiortc.contrib.media import MediaPlayer, MediaRelay
15
+ import asyncio
16
 
17
  __version__ = "0.0.3"
18
 
19
+ # KEY_NAME = "<REDACTED-API-KEY>"  # SECURITY: a live API key was committed here; revoke it. Best practice: keep API keys out of the main code, use environment variables.
20
+
21
 
22
  # Configuration and Utilities
23
  class GeminiConfig:
24
  """Configuration settings for Gemini API."""
25
+
26
  def __init__(self):
27
+ self.api_key = os.environ.get("GEMINI_API_KEY") # Use a more descriptive name
28
+ if not self.api_key:
29
+ raise ValueError("GEMINI_API_KEY environment variable is not set.")
30
  self.host = "generativelanguage.googleapis.com"
31
  self.model = "models/gemini-2.0-flash-exp"
32
+ # FastRTC doesn't use WebSockets directly in the same way. We'll handle the API calls differently.
33
+ self.base_url = f"https://{self.host}/v1alpha/{self.model}:streamGenerateContent?key={self.api_key}"
34
+
35
 
36
  class AudioProcessor:
37
  """Handles encoding and decoding of audio data."""
38
+
39
  @staticmethod
40
+ def encode_audio(data: np.ndarray, sample_rate: int) -> str:
41
  """Encodes audio data to base64."""
42
+ # Ensure data is in the correct format (int16)
43
+ if data.dtype != np.int16:
44
+ data = data.astype(np.int16)
45
  encoded = base64.b64encode(data.tobytes()).decode("UTF-8")
46
+ return encoded
 
 
 
 
 
 
 
 
 
47
 
48
  @staticmethod
49
+ def process_audio_response(data: str) -> np.ndarray:
50
  """Decodes audio data from base64."""
51
  audio_data = base64.b64decode(data)
52
  return np.frombuffer(audio_data, dtype=np.int16)
53
 
 
 
 
 
 
 
 
 
 
54
 
55
+ # We don't need a StreamHandler in the same way with FastRTC. We'll handle streaming directly.
56
class GeminiHandler:
    """Handles interactions with the Gemini API over a WebRTC peer connection.

    Receives microphone audio from the browser via an aiortc/FastRTC
    PeerConnection, forwards it to the Gemini REST streaming endpoint,
    and buffers the audio that comes back so it can be re-emitted in
    fixed-size frames.

    NOTE(review): several API usages below look inconsistent with the
    public aiortc API and should be verified against a running setup;
    they are flagged inline.
    """

    def __init__(self, output_sample_rate=24000, output_frame_size=480):
        # GeminiConfig() raises ValueError if the API key env var is unset,
        # so constructing this handler requires a configured environment.
        self.config = GeminiConfig()
        self.audio_processor = AudioProcessor()
        self.output_sample_rate = output_sample_rate
        self.output_frame_size = output_frame_size
        # Accumulated int16 samples received from Gemini; None until first chunk.
        self.all_output_data = None
        self.pc = None  # PeerConnection
        self.dc = None  # DataChannel
        self.audio_track = None
        self._audio_buffer = []  # NOTE(review): never used elsewhere in this class
        self.relay = MediaRelay()

    async def _send_audio_to_gemini(self, encoded_audio: str):
        """Sends one base64 audio chunk to the Gemini API and streams the response.

        Posts a streamGenerateContent request and, for every inlineData part in
        the chunked JSON response, hands the audio payload to
        _process_server_audio.
        """
        headers = {"Content-Type": "application/json"}
        payload = {
            "contents": [
                {
                    "parts": [
                        {
                            "text": "Respond to the audio with audio."
                        },  # Initial prompt, can be adjusted
                        {"inline_data": {"mime_type": "audio/pcm;rate=24000", "data": encoded_audio}},
                    ]
                }
            ]
        }
        # Use aiohttp for asynchronous HTTP requests.
        # NOTE(review): imported lazily so the module loads without aiohttp installed,
        # but aiohttp is a hard runtime dependency of this method.
        import aiohttp

        async with aiohttp.ClientSession() as session:
            async with session.post(
                self.config.base_url, headers=headers, data=json.dumps(payload)
            ) as response:
                if response.status != 200:
                    print(f"Error: Gemini API returned status {response.status}")
                    print(await response.text())
                    return

                async for line in response.content:
                    try:
                        line = line.strip()
                        if not line:
                            continue
                        # Responses are chunked, often with multiple JSON objects per chunk. Handle that.
                        # NOTE(review): assumes each JSON object fits on one line; a JSON
                        # object split across chunks will fail to parse — confirm against
                        # the actual response framing of streamGenerateContent.
                        for chunk in line.decode("utf-8").split("\n"):
                            if not chunk.strip():
                                continue
                            try:
                                data = json.loads(chunk)
                            except json.JSONDecodeError:
                                print(f"JSONDecodeError: {chunk}")
                                continue

                            if "candidates" in data:
                                for candidate in data["candidates"]:
                                    for part in candidate.get("content", {}).get("parts", []):
                                        if "inlineData" in part:
                                            audio_data = part["inlineData"].get("data", "")
                                            if audio_data:
                                                await self._process_server_audio(audio_data)

                    except Exception as e:
                        print(f"Error processing response chunk: {e}")

    async def _process_server_audio(self, audio_data: str):
        """Processes and buffers audio data received from the server.

        Decodes the base64 payload, appends it to the running sample buffer,
        and emits complete frames of output_frame_size samples.
        """
        audio_array = self.audio_processor.process_audio_response(audio_data)
        if self.all_output_data is None:
            self.all_output_data = audio_array
        else:
            self.all_output_data = np.concatenate((self.all_output_data, audio_array))

        while self.all_output_data.shape[-1] >= self.output_frame_size:
            # NOTE(review): av.AudioFrame (used by aiortc) is not constructed with
            # these keyword arguments (samples/sample_rate/layout/data) — confirm
            # which AudioFrame class fastrtc exports and its constructor signature.
            frame = AudioFrame(
                samples=self.output_frame_size,
                sample_rate=self.output_sample_rate,
                layout="mono",  # mono channel
                data=self.all_output_data[: self.output_frame_size].tobytes()
            )
            self.all_output_data = self.all_output_data[self.output_frame_size:]
            # NOTE(review): self.audio_track is the REMOTE track stored in on_track;
            # remote tracks in aiortc have no emit() method — verify this call path.
            if self.audio_track:
                await self.audio_track.emit(frame)


    async def on_track(self, track):
        """Handles incoming media tracks from the peer connection."""
        print(f"Track received: {track.kind}")
        if track.kind == "audio":
            self.audio_track = track  # Store the audio track

            # NOTE(review): aiortc tracks are consumed with `await track.recv()` in a
            # loop; they do not fire a "frame" event — confirm this decorator pattern
            # is supported by the track class actually in use.
            @track.on("frame")
            async def on_frame(frame):
                # Process received audio frames
                if isinstance(frame, AudioFrame):
                    try:
                        # Convert the frame data to a NumPy array
                        # (assumes 16-bit mono PCM — TODO confirm frame format)
                        audio_data = np.frombuffer(frame.data, dtype=np.int16)
                        # Encode the audio and send it to Gemini
                        encoded_audio = self.audio_processor.encode_audio(
                            audio_data, frame.sample_rate
                        )  # Pass sample rate
                        await self._send_audio_to_gemini(encoded_audio)
                    except Exception as e:
                        print(f"Error processing audio frame: {e}")

    async def on_datachannel(self, channel):
        """Handles data channel events (not used in this example, but good practice)."""
        self.dc = channel
        print("Data channel created")

        @channel.on("message")
        async def on_message(message):
            print(f"Received message: {message}")

    async def connect(self):
        """Establishes the PeerConnection and returns the local SDP offer."""
        # NOTE(review): aiortc exposes RTCPeerConnection, not PeerConnection —
        # confirm that fastrtc actually exports a class by this name.
        self.pc = PeerConnection()
        self.pc.on("track", self.on_track)
        self.pc.on("datachannel", self.on_datachannel)

        # Create a local audio track to send data.
        # NOTE(review): "avfoundation" is a macOS-only capture backend; this will
        # fail on Linux servers (e.g. Hugging Face Spaces) — confirm deployment target.
        self.local_audio_player = MediaPlayer("default", format="avfoundation", options={"channels": "1", "sample_rate": str(self.output_sample_rate)})
        self.local_audio = self.relay.subscribe(self.local_audio_player.audio)
        self.pc.addTrack(self.local_audio)

        # Add a data channel (optional, but good practice)
        self.dc = self.pc.createDataChannel("data")

        # Create an offer and set local description
        offer = await self.pc.createOffer()
        await self.pc.setLocalDescription(offer)
        print("PeerConnection established")
        return self.pc.localDescription

    async def set_remote_description(self, sdp, type):
        """Sets the remote description; returns an answer if the remote sent an offer."""
        from aiortc import RTCSessionDescription

        await self.pc.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=type))
        print("Remote description set")

        if self.pc.remoteDescription.type == "offer":
            answer = await self.pc.createAnswer()
            await self.pc.setLocalDescription(answer)
            return self.pc.localDescription

    async def add_ice_candidate(self, candidate, sdpMid, sdpMLineIndex):
        """Adds an ICE candidate to the peer connection."""
        from aiortc import RTCIceCandidate

        if candidate:
            try:
                # NOTE(review): aiortc's RTCIceCandidate constructor takes parsed fields
                # (component, foundation, ip, port, ...), not a raw candidate string —
                # this likely needs aiortc.sdp.candidate_from_sdp; verify.
                ice_candidate = RTCIceCandidate(
                    candidate=candidate, sdpMid=sdpMid, sdpMLineIndex=sdpMLineIndex
                )
                await self.pc.addIceCandidate(ice_candidate)
                print("ICE candidate added")
            except Exception as e:
                print(f"Error adding ICE candidate: {e}")

    def shutdown(self):
        """Closes the PeerConnection."""
        if self.pc:
            # NOTE(review): create_task requires a running event loop; calling
            # shutdown() from synchronous teardown code will raise RuntimeError.
            asyncio.create_task(self.pc.close())  # Close in the background
            self.pc = None
            print("PeerConnection closed")
226
+
227
+
228
+ # Gradio Interface
229
+ async def registry(
230
+ name: str,
231
+ token: str | None = None,
232
+ **kwargs,
 
 
 
 
 
 
 
233
  ):
234
  """Sets up and returns the Gradio interface."""
235
+ gemini_handler = GeminiHandler()
236
+
237
    async def connect_webrtc(sdp, type, candidates):
        """Connects to the WebRTC client and handles ICE candidates.

        Async generator used as a Gradio event handler: performs the
        offer/answer signalling dance with the enclosing GeminiHandler and
        yields JSON strings describing the signalling state back to the UI.

        NOTE(review): the parameter `type` shadows the builtin; kept as-is
        for signature compatibility.
        NOTE(review): the click handler wires inputs=[webrtc] (a single
        component value) to this three-parameter function — confirm Gradio
        unpacks the dict into (sdp, type, candidates) as intended.
        """
        # First call: no PeerConnection yet, so create one and yield our offer.
        if gemini_handler.pc is None:
            local_description = await gemini_handler.connect()
            if local_description:
                yield json.dumps(
                    {
                        "sdp": local_description.sdp,
                        "type": local_description.type,
                        "candidates": [],
                    }
                )  # Return initial SDP
        # If the client supplied a remote description, apply it; an answer is
        # only produced when the remote description was an offer.
        if sdp and type:
            answer = await gemini_handler.set_remote_description(sdp, type)
            if answer:
                yield json.dumps({"sdp": answer.sdp, "type": answer.type, "candidates": []})

        # Trickle in any ICE candidates the client has gathered so far.
        for candidate in candidates:
            if candidate and candidate.get("candidate"):
                await gemini_handler.add_ice_candidate(
                    candidate["candidate"], candidate.get("sdpMid"), candidate.get("sdpMLineIndex")
                )
        yield json.dumps({"sdp": "", "type": "", "candidates": []})  # Signal completion
260
 
261
  interface = gr.Blocks()
262
  with interface:
 
269
  </div>
270
  """
271
  )
 
272
  with gr.Row():
273
+ webrtc_out = gr.JSON(label="WebRTC JSON")
274
+
275
+ # Use the built-in WebRTC component, but without automatic streaming.
276
+ webrtc = gr.WebRTC(
277
+ value={"sdp": "", "type": "", "candidates": []},
278
+ interactive=True,
279
+ label="Voice Chat",
280
+ )
281
+
282
+ connect_button = gr.Button("Connect")
283
+ connect_button.click(
284
+ connect_webrtc,
285
+ inputs=[
286
+ webrtc
287
+ ], # Pass the WebRTC component's value (SDP, type, candidates)
288
+ outputs=[webrtc_out], # show the webrtc connection data
289
  )
290
+
291
  return interface
292
 
293
  # Launch the Gradio interface
294
async def main():
    """Builds the Gradio interface and launches it.

    Awaits the async `registry` factory, enables request queuing, then
    starts the Gradio server.
    """
    interface = await registry(name="gemini-2.0-flash-exp")
    interface.queue()  # Enable queuing for better concurrency
    # BUG FIX: gradio's Blocks.launch() is a synchronous method, not a
    # coroutine — `await interface.launch()` raises TypeError at runtime.
    interface.launch()

if __name__ == "__main__":
    asyncio.run(main())