gemini-webrtc

Sleeping

App Files Files Community

mgokg committed on May 15

Commit

734afbb

verified ·

1 Parent(s): 57775e3

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -62

app.py CHANGED Viewed

@@ -15,10 +15,14 @@ from fastrtc import (
     get_cloudflare_turn_credentials_async,
     wait_for_item,
 )
-from google import genai
 from gradio.utils import get_space
 from PIL import Image
 load_dotenv()
@@ -39,6 +43,42 @@ def encode_image(data: np.ndarray) -> dict:
     return {"mime_type": "image/jpeg", "data": base64_str}
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self,
@@ -54,41 +94,127 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.last_frame_time = 0
         self.quit = asyncio.Event()
     def copy(self) -> "GeminiHandler":
         return GeminiHandler()
     async def start_up(self):
-        client = genai.Client(
-            api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
-        )
-        config = {"response_modalities": ["AUDIO"]}
-        async with client.aio.live.connect(
-            model="gemini-2.0-flash-exp",
-            config=config,  # type: ignore
-        ) as session:
-            self.session = session
-            while not self.quit.is_set():
-                turn = self.session.receive()
-                try:
-                    async for response in turn:
-                        if data := response.data:
-                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                        self.audio_queue.put_nowait(audio)
-                except websockets.exceptions.ConnectionClosedOK:
-                    print("connection closed")
-                    break
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
-        if self.session:
-            # send image every 1 second
-            print(time.time() - self.last_frame_time)
-            if time.time() - self.last_frame_time > 1:
                 self.last_frame_time = time.time()
-                await self.session.send(input=encode_image(frame))
-                if self.latest_args[1] is not None:
-                    await self.session.send(input=encode_image(self.latest_args[1]))
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
@@ -97,40 +223,50 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         else:
             return np.zeros((100, 100, 3), dtype=np.uint8)
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         _, array = frame
         array = array.squeeze()
         audio_message = encode_audio(array)
-        if self.session:
-            await self.session.send(input=audio_message)
-    async def emit(self):
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
-        return array
     async def shutdown(self) -> None:
-        if self.session:
-            self.quit.set()
-            await self.session.close()
-            self.quit.clear()
-stream = Stream(
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
     rtc_configuration=get_cloudflare_turn_credentials_async,
     time_limit=180 if get_space() else None,
     additional_inputs=[
-        gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
     ],
     ui_args={
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
-        "title": "Gemini Audio Video Chat",
     },
 )
@@ -138,22 +274,28 @@ css = """
 #video-source {max-width: 500px !important; max-height: 500px !important;}
 """
-with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
         <div>
           <center>
-            <h1>Gen AI Voice Chat</h1>
-            <p>real-time audio + video streaming</p>
           </center>
         </div>
     """
     )
     with gr.Row() as row:
         with gr.Column():
             webrtc = WebRTC(
                 label="Video Chat",
-                modality="audio-video",
                 mode="send-receive",
                 elem_id="video-source",
                 rtc_configuration=get_cloudflare_turn_credentials_async,
@@ -161,26 +303,39 @@ with gr.Blocks(css=css) as demo:
                 pulse_color="rgb(255, 255, 255)",
                 icon_button_color="rgb(255, 255, 255)",
             )
-        #with gr.Column():
-            #image_input = gr.Image(
-                #label="Image", type="numpy", sources=["upload", "clipboard"]
-            #)
-        webrtc.stream(
-            GeminiHandler(),
-            inputs=[webrtc],
-            outputs=[webrtc],
-            time_limit=180 if get_space() else None,
-            concurrency_limit=2 if get_space() else None,
-        )
-stream.ui = demo
 if __name__ == "__main__":
-    if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860)
-    elif mode == "PHONE":
-        raise ValueError("Phone mode not supported for this demo")
-    else:
-        stream.ui.launch(server_port=7860)

     get_cloudflare_turn_credentials_async,
     wait_for_item,
 )
+from google import genai # Assuming this is from google-generativeai or compatible
+from google.generativeai.types import Tool, FunctionDeclaration, ToolConfig, Part # For Tool calling
 from gradio.utils import get_space
 from PIL import Image
+# For Google Search
+from googlesearch import search as google_search_engine
 load_dotenv()
     return {"mime_type": "image/jpeg", "data": base64_str}
+def perform_google_search(query: str, num_results: int = 3) -> dict:
+    """
+    Performs a Google search and returns a summary of results.
+    Args:
+        query: The search query.
+        num_results: The number of results to fetch.
+    Returns:
+        A dictionary suitable for Gemini's function response, containing
+        either a 'summary' of results or an 'error' message.
+    """
+    print(f"Performing Google search for: '{query}'...")
+    try:
+        search_results_links = []
+        # Using a loop to get a specific number of results as googlesearch is a generator
+        count = 0
+        for url in google_search_engine(query, num_results=num_results, stop=num_results, pause=1.0):
+            search_results_links.append(url)
+            count += 1
+            if count >= num_results:
+                break
+        if not search_results_links:
+            return {"summary": "No direct results found on the web. You could try rephrasing your search."}
+        # Prepare a summary for Gemini
+        summary_text = "Found the following links based on your search:\n" + "\n".join(search_results_links)
+        print(f"Search results: {summary_text}")
+        return {"summary": summary_text}
+    except Exception as e:
+        print(f"Google search error: {e}")
+        if "HTTP Error 429" in str(e) or "429" in str(e): # Handle rate limiting
+            return {"error": "The search service is temporarily busy (rate limited). Please try again in a moment."}
+        return {"error": f"An error occurred during the search: {str(e)}"}
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self,
         self.last_frame_time = 0
         self.quit = asyncio.Event()
+        # Define the Google Search tool for Gemini
+        self.google_search_tool_declaration = FunctionDeclaration(
+            name="perform_google_search_for_user", # Name Gemini will use
+            description="Performs a Google search for a given query and returns a summary of the top results. Use this for general web searches.",
+            parameters={
+                "type": "OBJECT",
+                "properties": {
+                    "query": {"type": "STRING", "description": "The search query to look up on Google."}
+                },
+                "required": ["query"],
+            },
+        )
+        self.gemini_tools = [Tool(function_declarations=[self.google_search_tool_declaration])]
     def copy(self) -> "GeminiHandler":
+        # Return a fresh GeminiHandler constructed with no arguments.
         return GeminiHandler()
     async def start_up(self):
+        # Ensure GEMINI_API_KEY is set in your .env file or environment
+        api_key = os.getenv("GEMINI_API_KEY")
+        if not api_key:
+            raise ValueError("GEMINI_API_KEY not found in environment variables.")
+        # Using google.generativeai's standard client setup if `from google import genai` provides it
+        # If `genai.Client` is from a different library, this part might need adjustment.
+        # Assuming `genai.Client` is akin to `google.generativeai.GenerativeServiceAsyncClient`
+        # or a wrapper that `google-generativeai` provides.
+        # The original code uses `genai.Client(...)`. Let's try to adapt that.
+        # Removing `http_options` to use default (likely v1beta for tools)
+        # If `v1alpha` is strictly required by your `genai.Client`, tool support might be limited.
+        client = genai.Client(api_key=api_key)
+        # Configure Gemini for audio response and tool usage
+        config = {
+            "response_modalities": ["AUDIO"],
+            "tool_config": ToolConfig(function_declarations=[self.google_search_tool_declaration])
+        }
+        # Using a model known for good tool use and speed
+        model_name = "gemini-1.5-flash-latest"
+        print(f"Connecting to Gemini model: {model_name} with search tool enabled.")
+        try:
+            async with client.aio.live.connect(
+                model=model_name,
+                config=config,
+            ) as session:
+                self.session = session
+                print("Gemini session started successfully. You can now speak or ask to search.")
+                while not self.quit.is_set():
+                    current_turn = self.session.receive() # This gets a LiveTurn object
+                    try:
+                        # First, process any incoming audio chunks from Gemini for this turn
+                        async for response_chunk in current_turn: # Iterates over LiveResponseChunk
+                            if response_chunk.data: # This is audio data from Gemini
+                                audio = np.frombuffer(response_chunk.data, dtype=np.int16).reshape(1, -1)
+                                self.audio_queue.put_nowait(audio)
+                        # After processing all chunks, check if Gemini requested a tool call in this turn
+                        if current_turn.tool_code and current_turn.tool_code.function_call:
+                            fc = current_turn.tool_code.function_call
+                            tool_name = fc.name
+                            tool_args = fc.args
+                            if tool_name == "perform_google_search_for_user":
+                                query = tool_args.get("query")
+                                if not query:
+                                    print("Error: 'query' argument missing for search tool.")
+                                    tool_response_part = Part.from_function_response(
+                                        name=tool_name,
+                                        response={"error": "Missing 'query' argument for search."}
+                                    )
+                                else:
+                                    print(f"Gemini requested search: '{query}'")
+                                    # Run the blocking search function in a separate thread
+                                    search_result_dict = await asyncio.to_thread(perform_google_search, query)
+                                    tool_response_part = Part.from_function_response(
+                                        name=tool_name,
+                                        response=search_result_dict # Pass the dict directly
+                                    )
+                                print(f"Sending search tool response to Gemini: {search_result_dict}")
+                                await self.session.send(input=[tool_response_part])
+                            else:
+                                print(f"Error: Gemini requested unknown tool: {tool_name}")
+                                tool_error_response = Part.from_function_response(
+                                    name=tool_name,
+                                    response={"error": f"Tool '{tool_name}' is not implemented."}
+                                )
+                                await self.session.send(input=[tool_error_response])
+                    except websockets.exceptions.ConnectionClosedOK:
+                        print("WebSocket connection closed by Gemini server.")
+                        break
+                    except Exception as e:
+                        print(f"Error processing a turn from Gemini: {e}")
+                        # Decide if to break or continue based on error severity
+                        break # Safest to break on unexpected errors in the loop
+        except Exception as e:
+            print(f"Failed to connect or maintain Gemini session: {e}")
+            # Handle connection errors, API key issues, etc.
+        finally:
+            self.quit.set() # Ensure shutdown is triggered if loop exits
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
+        if self.session and not self.session.closed:
+            if time.time() - self.last_frame_time > 1: # Send video frame every 1 second
                 self.last_frame_time = time.time()
+                try:
+                    await self.session.send(input=encode_image(frame))
+                    # latest_args[0] is webrtc component, latest_args[1] is image_input
+                    if self.latest_args and len(self.latest_args) > 1 and self.latest_args[1] is not None:
+                        await self.session.send(input=encode_image(self.latest_args[1]))
+                except Exception as e:
+                    print(f"Error sending video/image to Gemini: {e}")
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         else:
             return np.zeros((100, 100, 3), dtype=np.uint8)
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None: # Receives audio from user
         _, array = frame
         array = array.squeeze()
         audio_message = encode_audio(array)
+        if self.session and not self.session.closed:
+            try:
+                await self.session.send(input=audio_message)
+            except Exception as e:
+                print(f"Error sending audio to Gemini: {e}")
+    async def emit(self): # Emits audio from Gemini to user
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
+        return None # Return None if no audio, as per fastrtc expectation
     async def shutdown(self) -> None:
+        print("Shutting down GeminiHandler...")
+        self.quit.set()
+        if self.session and not self.session.closed:
+            try:
+                await self.session.close()
+                print("Gemini session closed.")
+            except Exception as e:
+                print(f"Error closing Gemini session: {e}")
+        self.session = None
+# --- Gradio UI (largely unchanged) ---
+# NOTE(review): the author's comment describes this Stream as belonging to the older
+# gr.Interface-style setup; the launch code at the bottom of the file starts `demo`
+# directly, so confirm whether this object is still needed.
+stream = Stream(
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
     rtc_configuration=get_cloudflare_turn_credentials_async,
     time_limit=180 if get_space() else None,
     additional_inputs=[
+        gr.Image(label="Optional Image Input", type="numpy", sources=["upload", "clipboard"])
     ],
     ui_args={
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
+        "title": "Gemini Audio Video Chat + Search",
     },
 )
 #video-source {max-width: 500px !important; max-height: 500px !important;}
 """
+with gr.Blocks(css=css, title="Gemini AV Chat + Search") as demo:
     gr.HTML(
         """
         <div>
           <center>
+            <h1>Gen AI Voice Chat with Google Search</h1>
+            <p>Real-time audio + video streaming, with integrated Google Search via Gemini.</p>
+            <p><small>Note: Search uses web scraping, which may be rate-limited or unreliable for heavy use. For production, use official APIs.</small></p>
           </center>
         </div>
     """
     )
+    # Additional input for an image (as in original `stream` object)
+    # This needs to be passed to the handler if you want it to be used.
+    # The `webrtc.stream` inputs must match what the handler expects in `self.latest_args`
+    image_input_component = gr.Image(label="Optional Image Input", type="numpy", sources=["upload", "clipboard"])
     with gr.Row() as row:
         with gr.Column():
             webrtc = WebRTC(
                 label="Video Chat",
+                modality="audio-video", # This component handles both audio and video from user
                 mode="send-receive",
                 elem_id="video-source",
                 rtc_configuration=get_cloudflare_turn_credentials_async,
                 pulse_color="rgb(255, 255, 255)",
                 icon_button_color="rgb(255, 255, 255)",
             )
+        # image_input_component is defined above the row.
+    # The WebRTC component is the primary audio/video input; the optional image is
+    # wired in as a second input. `webrtc.stream` passes its inputs to the handler's
+    # `self.latest_args`, so with inputs=[webrtc, image_input_component] the handler
+    # sees the webrtc data in latest_args[0] and the image data in latest_args[1].
+    webrtc.stream(
+        handler_class=GeminiHandler, # Pass the class, not an instance for gr.Blocks
+        inputs=[webrtc, image_input_component], # webrtc is audio/video, image_input_component for the static image
+        outputs=[webrtc], # webrtc for audio/video output from Gemini
+        time_limit=180 if get_space() else None,
+        concurrency_limit=2 if get_space() else None,
+    )
+# The previous revision assigned `stream.ui = demo` and launched via `stream.ui.launch()`.
+# This revision launches the gr.Blocks app directly with `demo.launch()`.
+# NOTE(review): confirm that fastrtc does not require the `stream.ui` assignment.
 if __name__ == "__main__":
+    if os.getenv("GEMINI_API_KEY") is None:
+        print("WARNING: GEMINI_API_KEY environment variable not set. The application may not work.")
+    # The previous revision built a fastrtc `Stream`, replaced its UI with the
+    # gr.Blocks app (`stream.ui = demo`) and launched through `stream.ui.launch()`.
+    # Because the `webrtc.stream(...)` call wires the handler into the Blocks app
+    # itself, `demo` is treated as the main UI and launched directly.
+    # NOTE(review): confirm that `fastrtc.Stream` does not need `stream.ui = demo` here.
+    demo.launch(server_port=7860, debug=True) # Added debug=True for development