gemini-webrtc

Running

App Files Files Community

mgokg committed on May 15

Commit

4380ffd

verified ·

1 Parent(s): 1208a72

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -29

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import base64
 import os
 import time
 from io import BytesIO
 import gradio as gr
 import numpy as np
@@ -16,11 +17,72 @@ from fastrtc import (
     wait_for_item,
 )
 from google import genai
 from gradio.utils import get_space
 from PIL import Image
 load_dotenv()
 def encode_audio(data: np.ndarray) -> dict:
     """Encode Audio data to send to the server"""
@@ -53,68 +115,148 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.session = None
         self.last_frame_time = 0
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiHandler":
         return GeminiHandler()
     async def start_up(self):
-        client = genai.Client(
             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
         )
-        config = {"response_modalities": ["AUDIO"]}
-        async with client.aio.live.connect(
-            model="gemini-2.0-flash-exp",
-            config=config,  # type: ignore
         ) as session:
             self.session = session
             while not self.quit.is_set():
                 turn = self.session.receive()
                 try:
-                    async for response in turn:
-                        if data := response.data:
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                        self.audio_queue.put_nowait(audio)
                 except websockets.exceptions.ConnectionClosedOK:
-                    print("connection closed")
                     break
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
         if self.session:
-            # send image every 1 second
-            print(time.time() - self.last_frame_time)
-            if time.time() - self.last_frame_time > 1:
-                self.last_frame_time = time.time()
                 await self.session.send(input=encode_image(frame))
-                if self.latest_args[1] is not None:
-                    await self.session.send(input=encode_image(self.latest_args[1]))
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         if frame is not None:
             return frame
         else:
-            return np.zeros((100, 100, 3), dtype=np.uint8)
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
-        audio_message = encode_audio(array)
         if self.session:
-            await self.session.send(input=audio_message)
     async def emit(self):
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
-        return array
     async def shutdown(self) -> None:
         if self.session:
             self.quit.set()
-            await self.session.close()
-            self.quit.clear()
 stream = Stream(
@@ -130,7 +272,7 @@ stream = Stream(
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
-        "title": "Gemini Audio Video Chat",
     },
 )
@@ -146,10 +288,11 @@ with gr.Blocks(css=css) as demo:
             <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
         </div>
         <div>
-            <h1>Gen AI SDK Voice Chat</h1>
-            <p>Speak with Gemini using real-time audio + video streaming</p>
             <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href="https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
-            <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
         </div>
     </div>
     """
@@ -168,12 +311,15 @@ with gr.Blocks(css=css) as demo:
             )
         with gr.Column():
             image_input = gr.Image(
-                label="Image", type="numpy", sources=["upload", "clipboard"]
             )
         webrtc.stream(
-            GeminiHandler(),
-            inputs=[webrtc, image_input],
             outputs=[webrtc],
             time_limit=180 if get_space() else None,
             concurrency_limit=2 if get_space() else None,
@@ -183,9 +329,16 @@ stream.ui = demo
 if __name__ == "__main__":
     if (mode := os.getenv("MODE")) == "UI":
         stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported for this demo")
     else:
-        stream.ui.launch(server_port=7860)

 import os
 import time
 from io import BytesIO
+import functools # Added for to_thread
 import gradio as gr
 import numpy as np
     wait_for_item,
 )
 from google import genai
+# Ensure genai.protos is accessible for Tool, FunctionDeclaration etc.
+# If not, you might need from google.generativeai.types import Tool, FunctionDeclaration, Schema, Part, Content
+# However, with live.connect using v1alpha, direct proto usage is often needed.
+from google.generativeai import protos # Explicitly import protos
 from gradio.utils import get_space
 from PIL import Image
+from googleapiclient.discovery import build # Added for Google Search
 load_dotenv()
+# --- Environment Variables for Google Search ---
+GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
+GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
+# --- Google Search Function ---
+async def perform_google_search_async(query: str, num_results: int = 3) -> str:
+    """
+    Performs a Google search using the Custom Search API and returns formatted results.
+    """
+    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_CSE_ID:
+        print("Google Search API key or CSE ID not configured.")
+        return "Search functionality is not configured."
+    try:
+        loop = asyncio.get_running_loop()
+        # Create a partial function for the blocking call
+        partial_search = functools.partial(
+            build("customsearch", "v1", developerKey=GOOGLE_SEARCH_API_KEY).cse().list(
+                q=query, cx=GOOGLE_CSE_ID, num=num_results
+            ).execute
+        )
+        # Run the blocking call in a thread pool
+        res = await loop.run_in_executor(None, partial_search)
+        if 'items' in res and res['items']:
+            results = []
+            for item in res['items']:
+                title = item.get('title', 'N/A')
+                link = item.get('link', 'N/A')
+                snippet = item.get('snippet', 'N/A').replace("\n", " ")
+                results.append(f"Title: {title}\nLink: {link}\nSnippet: {snippet}\n---")
+            return "\n".join(results)
+        else:
+            return "No search results found."
+    except Exception as e:
+        print(f"Error during Google Search: {e}")
+        return f"An error occurred while searching: {str(e)}"
+# --- Define the Google Search Tool for Gemini ---
+# Using genai.protos directly as LiveSession client might expect raw protos
+google_search_tool = protos.Tool(
+    function_declarations=[
+        protos.FunctionDeclaration(
+            name="perform_google_search",
+            description="Performs a Google search for a given query and returns a summary of the top results. Use this for general web searches or finding specific information online.",
+            parameters=protos.Schema(
+                type=protos.Type.OBJECT,
+                properties={
+                    "query": protos.Schema(type=protos.Type.STRING, description="The search query to use for Google Search."),
+                    "num_results": protos.Schema(type=protos.Type.NUMBER, description="Optional. Number of search results to return (default is 3).")
+                },
+                required=["query"]
+            )
+        )
+    ]
+)
 def encode_audio(data: np.ndarray) -> dict:
     """Encode Audio data to send to the server"""
         self.session = None
         self.last_frame_time = 0
         self.quit = asyncio.Event()
+        self.client = None # Store client
     def copy(self) -> "GeminiHandler":
+        """Return a new ``GeminiHandler`` built with the default constructor arguments."""
         return GeminiHandler()
     async def start_up(self):
+        self.client = genai.Client( # Use self.client
             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
         )
+        # Configure Gemini to use the search tool
+        # Note: For v1alpha live client, config might be a dict or protos.StreamingConfig
+        # protos.ToolConfig and protos.FunctionCallingConfig might be needed for more control
+        # e.g. tool_config=protos.ToolConfig(function_calling_config=protos.FunctionCallingConfig(mode=protos.FunctionCallingConfig.Mode.ANY))
+        streaming_config = protos.StreamingConfig(
+            response_modalities=[protos.ResponseModality.AUDIO], # Use enum
+            tools=[google_search_tool]
+        )
+        # If you need to force tool usage or set mode:
+        # streaming_config.tool_config.CopyFrom(protos.ToolConfig(
+        #    function_calling_config=protos.FunctionCallingConfig(mode=protos.FunctionCallingConfig.Mode.ANY)
+        # ))
+        async with self.client.aio.live.connect(
+            model="gemini-2.0-flash-exp", # Or "gemini-1.5-flash-latest" which is known to support tools well
+            config=streaming_config,
         ) as session:
             self.session = session
+            print("Gemini session started.")
             while not self.quit.is_set():
                 turn = self.session.receive()
                 try:
+                    async for response_proto in turn: # response_proto is protos.Response
+                        # Check for function calls
+                        if response_proto.function_call and response_proto.function_call.name:
+                            fc = response_proto.function_call
+                            if fc.name == "perform_google_search":
+                                query = fc.args["query"]
+                                num_results = fc.args.get("num_results", 3) # Get optional num_results
+                                print(f"Gemini requested Google search for: '{query}' with {num_results} results.")
+                                search_results_text = await perform_google_search_async(query, int(num_results))
+                                print(f"Search results (first 200 chars): {search_results_text[:200]}...")
+                                # Send search results back to Gemini
+                                function_response_proto = protos.FunctionResponse(
+                                    name="perform_google_search",
+                                    response={"result": search_results_text} # Response must be a dict/struct
+                                )
+                                input_proto = protos.Input(function_response=function_response_proto)
+                                await self.session.send(input=input_proto)
+                                print("Sent search results back to Gemini.")
+                        # Handle audio data
+                        elif response_proto.audio_output and response_proto.audio_output.data:
+                            data = response_proto.audio_output.data
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                            self.audio_queue.put_nowait(audio)
+                        # You could also handle response_proto.text_output if needed
                 except websockets.exceptions.ConnectionClosedOK:
+                    print("Gemini session connection closed normally.")
                     break
+                except Exception as e:
+                    print(f"Error in Gemini session receive loop: {e}")
+                    # Consider how to handle errors, e.g., break or log and continue
+                    break # For now, break on error
+            print("Exited Gemini session receive loop.")
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
         if self.session:
+            current_time = time.time()
+            if current_time - self.last_frame_time > 1: # Send image every 1 second
+                self.last_frame_time = current_time
+                # The original code sends a dict. For v1alpha, it might need to be wrapped in protos.Input
+                # For simplicity, keeping as dict and assuming SDK handles it.
+                # If issues, wrap: image_part = protos.Part(inline_data=protos.Blob(mime_type="image/jpeg", data=...))
+                # input_proto = protos.Input(parts=[image_part])
+                # await self.session.send(input=input_proto)
                 await self.session.send(input=encode_image(frame))
+                # Handle additional image input from Gradio UI
+                if self.latest_args and len(self.latest_args) > 1 and self.latest_args[1] is not None:
+                    # Assuming self.latest_args[1] is the numpy array from the gr.Image input
+                    uploaded_image_data = self.latest_args[1]
+                    await self.session.send(input=encode_image(uploaded_image_data))
+                    # To avoid resending, you might want to clear it after sending
+                    # self.latest_args[1] = None # Or handle state more robustly
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         if frame is not None:
             return frame
         else:
+            return np.zeros((100, 100, 3), dtype=np.uint8) # Default blank frame
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        # Audio from user's microphone
         _, array = frame
         array = array.squeeze()
+        audio_message_dict = encode_audio(array) # This is a dict
         if self.session:
+            # For v1alpha, input should be protos.Input.
+            # The SDK might convert the dict, but explicit is safer.
+            audio_data_bytes = base64.b64decode(audio_message_dict["data"])
+            audio_part = protos.Part(
+                audio_input=protos.AudioData(
+                    audio=audio_data_bytes,
+                    # sample_rate_hertz=self.input_sample_rate # If API needs it
+                )
+            )
+            input_proto = protos.Input(parts=[audio_part])
+            await self.session.send(input=input_proto)
     async def emit(self):
+        # Audio to user's speakers (from Gemini)
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
+        return None # Return None if no audio, Gradio handles it
     async def shutdown(self) -> None:
+        print("Shutting down GeminiHandler...")
         if self.session:
             self.quit.set()
+            try:
+                await self.session.close()
+                print("Gemini session closed.")
+            except Exception as e:
+                print(f"Error closing Gemini session: {e}")
+        self.quit.clear()
+        # Clean up client if necessary, though it's managed by 'async with' in start_up
+        self.client = None
 stream = Stream(
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
+        "title": "Gemini Audio Video Chat with Search", # Updated title
     },
 )
             <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
         </div>
         <div>
+            <h1>Gen AI SDK Voice Chat with Google Search</h1>
+            <p>Speak with Gemini using real-time audio + video streaming, now with Google Search capability!</p>
+            <p>Try saying: "Search for the weather in London" or "Google the latest AI news."</p>
             <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href="https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
+            <p>Get a Gemini API Key <a href="https://aistudio.google.com/app/apikey">here</a>. You'll also need a Google Search API Key and CSE ID.</p>
         </div>
     </div>
     """
             )
         with gr.Column():
             image_input = gr.Image(
+                label="Image (optional, sent with video frames)", type="numpy", sources=["upload", "clipboard"]
             )
+        # The WebRTC.stream method will pass these inputs to the handler's methods.
+        # The handler's __init__ or other methods might need to store/access `image_input` if needed beyond `latest_args`.
+        # The `latest_args` in `video_receive` comes from the `inputs` list here.
         webrtc.stream(
+            GeminiHandler(), # A new instance of GeminiHandler for each stream session
+            inputs=[webrtc, image_input], # webrtc is args[0], image_input is args[1]
             outputs=[webrtc],
             time_limit=180 if get_space() else None,
             concurrency_limit=2 if get_space() else None,
 if __name__ == "__main__":
+    if not os.getenv("GEMINI_API_KEY"):
+        print("GEMINI_API_KEY not found in environment variables. Please set it in your .env file.")
+    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_CSE_ID:
+        print("GOOGLE_SEARCH_API_KEY or GOOGLE_CSE_ID not found. Search functionality will be limited.")
     if (mode := os.getenv("MODE")) == "UI":
         stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported for this demo")
     else:
+        # Default to UI launch if MODE is not set or unrecognized
+        print("Launching Gradio UI...")
+        stream.ui.launch(server_port=7860)