mgokg committed
Commit cfcda0e · verified · 1 Parent(s): c3cd47d

Update app.py

Files changed (1):
  app.py +62 -218
app.py CHANGED
@@ -15,15 +15,10 @@ from fastrtc import (
     get_cloudflare_turn_credentials_async,
     wait_for_item,
 )
-from google import genai  # Assuming this is from google-generativeai or compatible
-from google.genai import types  # import Tool, FunctionDeclaration, ToolConfig, Part
-# from google.generativeai.types import Tool, FunctionDeclaration, ToolConfig, Part  # For tool calling
+from google import genai
 from gradio.utils import get_space
 from PIL import Image
 
-# For Google Search
-from googlesearch import search as google_search_engine
-
 load_dotenv()
 
 
@@ -44,42 +39,6 @@ def encode_image(data: np.ndarray) -> dict:
     return {"mime_type": "image/jpeg", "data": base64_str}
 
 
-def perform_google_search(query: str, num_results: int = 3) -> dict:
-    """
-    Performs a Google search and returns a summary of the results.
-
-    Args:
-        query: The search query.
-        num_results: The number of results to fetch.
-
-    Returns:
-        A dictionary suitable for Gemini's function response, containing
-        either a 'summary' of results or an 'error' message.
-    """
-    print(f"Performing Google search for: '{query}'...")
-    try:
-        search_results_links = []
-        # googlesearch returns a generator, so collect up to num_results links
-        count = 0
-        for url in google_search_engine(query, num_results=num_results, stop=num_results, pause=1.0):
-            search_results_links.append(url)
-            count += 1
-            if count >= num_results:
-                break
-
-        if not search_results_links:
-            return {"summary": "No direct results found on the web. You could try rephrasing your search."}
-
-        # Prepare a summary for Gemini
-        summary_text = "Found the following links based on your search:\n" + "\n".join(search_results_links)
-        print(f"Search results: {summary_text}")
-        return {"summary": summary_text}
-
-    except Exception as e:
-        print(f"Google search error: {e}")
-        if "HTTP Error 429" in str(e) or "429" in str(e):  # Handle rate limiting
-            return {"error": "The search service is temporarily busy (rate limited). Please try again in a moment."}
-        return {"error": f"An error occurred during the search: {str(e)}"}
-
-
 class GeminiHandler(AsyncAudioVideoStreamHandler):
     def __init__(
         self,
@@ -95,127 +54,41 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.last_frame_time = 0
         self.quit = asyncio.Event()
 
-        # Define the Google Search tool for Gemini
-        self.google_search_tool_declaration = FunctionDeclaration(
-            name="perform_google_search_for_user",  # Name Gemini will use
-            description="Performs a Google search for a given query and returns a summary of the top results. Use this for general web searches.",
-            parameters={
-                "type": "OBJECT",
-                "properties": {
-                    "query": {"type": "STRING", "description": "The search query to look up on Google."}
-                },
-                "required": ["query"],
-            },
-        )
-        self.gemini_tools = [Tool(function_declarations=[self.google_search_tool_declaration])]
-
-
     def copy(self) -> "GeminiHandler":
         return GeminiHandler()
 
     async def start_up(self):
-        # Ensure GEMINI_API_KEY is set in your .env file or environment
-        api_key = os.getenv("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("GEMINI_API_KEY not found in environment variables.")
-
-        # Using google.generativeai's standard client setup if `from google import genai` provides it.
-        # If `genai.Client` is from a different library, this part might need adjustment.
-        # Assuming `genai.Client` is akin to `google.generativeai.GenerativeServiceAsyncClient`,
-        # or a wrapper that `google-generativeai` provides.
-
-        # The original code uses `genai.Client(...)`; adapt that here.
-        # `http_options` is removed to use the default API version (likely v1beta, for tools).
-        # If `v1alpha` is strictly required by your `genai.Client`, tool support might be limited.
-        client = genai.Client(api_key=api_key)
-
-        # Configure Gemini for audio responses and tool usage
-        config = {
-            "response_modalities": ["AUDIO"],
-            "tool_config": ToolConfig(function_declarations=[self.google_search_tool_declaration]),
-        }
-
-        # Use a model known for good tool use and speed
-        model_name = "gemini-1.5-flash-latest"
-        print(f"Connecting to Gemini model: {model_name} with search tool enabled.")
-
-        try:
-            async with client.aio.live.connect(
-                model=model_name,
-                config=config,
-            ) as session:
-                self.session = session
-                print("Gemini session started successfully. You can now speak or ask to search.")
-                while not self.quit.is_set():
-                    current_turn = self.session.receive()  # This gets a LiveTurn object
-
-                    try:
-                        # First, process any incoming audio chunks from Gemini for this turn
-                        async for response_chunk in current_turn:  # Iterates over LiveResponseChunk
-                            if response_chunk.data:  # Audio data from Gemini
-                                audio = np.frombuffer(response_chunk.data, dtype=np.int16).reshape(1, -1)
-                                self.audio_queue.put_nowait(audio)
-
-                        # After processing all chunks, check whether Gemini requested a tool call in this turn
-                        if current_turn.tool_code and current_turn.tool_code.function_call:
-                            fc = current_turn.tool_code.function_call
-                            tool_name = fc.name
-                            tool_args = fc.args
-
-                            if tool_name == "perform_google_search_for_user":
-                                query = tool_args.get("query")
-                                if not query:
-                                    print("Error: 'query' argument missing for search tool.")
-                                    tool_response_part = Part.from_function_response(
-                                        name=tool_name,
-                                        response={"error": "Missing 'query' argument for search."},
-                                    )
-                                else:
-                                    print(f"Gemini requested search: '{query}'")
-                                    # Run the blocking search function in a separate thread
-                                    search_result_dict = await asyncio.to_thread(perform_google_search, query)
-
-                                    tool_response_part = Part.from_function_response(
-                                        name=tool_name,
-                                        response=search_result_dict,  # Pass the dict directly
-                                    )
-                                print(f"Sending search tool response to Gemini: {search_result_dict}")
-                                await self.session.send(input=[tool_response_part])
-                            else:
-                                print(f"Error: Gemini requested unknown tool: {tool_name}")
-                                tool_error_response = Part.from_function_response(
-                                    name=tool_name,
-                                    response={"error": f"Tool '{tool_name}' is not implemented."},
-                                )
-                                await self.session.send(input=[tool_error_response])
-
-                    except websockets.exceptions.ConnectionClosedOK:
-                        print("WebSocket connection closed by Gemini server.")
-                        break
-                    except Exception as e:
-                        print(f"Error processing a turn from Gemini: {e}")
-                        # Safest to break on unexpected errors in the loop
-                        break
-        except Exception as e:
-            print(f"Failed to connect to or maintain the Gemini session: {e}")
-            # Handle connection errors, API key issues, etc.
-        finally:
-            self.quit.set()  # Ensure shutdown is triggered if the loop exits
+        client = genai.Client(
+            api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
+        )
+        config = {"response_modalities": ["AUDIO"]}
+        async with client.aio.live.connect(
+            model="gemini-2.0-flash-exp",
+            config=config,  # type: ignore
+        ) as session:
+            self.session = session
+            while not self.quit.is_set():
+                turn = self.session.receive()
+                try:
+                    async for response in turn:
+                        if data := response.data:
+                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                            self.audio_queue.put_nowait(audio)
+                except websockets.exceptions.ConnectionClosedOK:
+                    print("connection closed")
+                    break
 
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
 
-        if self.session and not self.session.closed:
-            if time.time() - self.last_frame_time > 1:  # Send a video frame every 1 second
+        if self.session:
+            # send image every 1 second
+            print(time.time() - self.last_frame_time)
+            if time.time() - self.last_frame_time > 1:
                 self.last_frame_time = time.time()
-                try:
-                    await self.session.send(input=encode_image(frame))
-                    # latest_args[0] is the webrtc component, latest_args[1] is the image input
-                    if self.latest_args and len(self.latest_args) > 1 and self.latest_args[1] is not None:
-                        await self.session.send(input=encode_image(self.latest_args[1]))
-                except Exception as e:
-                    print(f"Error sending video/image to Gemini: {e}")
-
+                await self.session.send(input=encode_image(frame))
+                if self.latest_args[1] is not None:
+                    await self.session.send(input=encode_image(self.latest_args[1]))
 
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
@@ -224,50 +97,40 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         else:
             return np.zeros((100, 100, 3), dtype=np.uint8)
 
-    async def receive(self, frame: tuple[int, np.ndarray]) -> None:  # Receives audio from the user
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)
-        if self.session and not self.session.closed:
-            try:
-                await self.session.send(input=audio_message)
-            except Exception as e:
-                print(f"Error sending audio to Gemini: {e}")
-
+        if self.session:
+            await self.session.send(input=audio_message)
 
-    async def emit(self):  # Emits audio from Gemini to the user
+    async def emit(self):
        array = await wait_for_item(self.audio_queue, 0.01)
        if array is not None:
            return (self.output_sample_rate, array)
-        return None  # Return None if there is no audio, as fastrtc expects
+        return array
 
     async def shutdown(self) -> None:
-        print("Shutting down GeminiHandler...")
-        self.quit.set()
-        if self.session and not self.session.closed:
-            try:
-                await self.session.close()
-                print("Gemini session closed.")
-            except Exception as e:
-                print(f"Error closing Gemini session: {e}")
-        self.session = None
+        if self.session:
+            self.quit.set()
+            await self.session.close()
+            self.quit.clear()
 
 
-# --- Gradio UI (largely unchanged) ---
-stream = Stream(  # This Stream object is for the deprecated gr.Interface way
+stream = Stream(
     handler=GeminiHandler(),
     modality="audio-video",
     mode="send-receive",
     rtc_configuration=get_cloudflare_turn_credentials_async,
     time_limit=180 if get_space() else None,
     additional_inputs=[
-        gr.Image(label="Optional Image Input", type="numpy", sources=["upload", "clipboard"])
+        gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
     ],
     ui_args={
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
-        "title": "Gemini Audio Video Chat + Search",
+        "title": "Gemini Audio Video Chat",
     },
 )
 
@@ -275,28 +138,22 @@ css = """
 #video-source {max-width: 500px !important; max-height: 500px !important;}
 """
 
-with gr.Blocks(css=css, title="Gemini AV Chat + Search") as demo:
+with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
         <div>
            <center>
-                <h1>Gen AI Voice Chat with Google Search</h1>
-                <p>Real-time audio + video streaming, with integrated Google Search via Gemini.</p>
-                <p><small>Note: Search uses web scraping, which may be rate-limited or unreliable for heavy use. For production, use official APIs.</small></p>
+                <h1>Gen AI Voice Chat</h1>
+                <p>real-time audio + video streaming</p>
            </center>
        </div>
        """
    )
-    # Additional input for an image (as in the original `stream` object).
-    # This needs to be passed to the handler if you want it to be used.
-    # The `webrtc.stream` inputs must match what the handler expects in `self.latest_args`.
-    image_input_component = gr.Image(label="Optional Image Input", type="numpy", sources=["upload", "clipboard"])
-
    with gr.Row() as row:
        with gr.Column():
            webrtc = WebRTC(
                label="Video Chat",
-                modality="audio-video",  # This component handles both audio and video from the user
+                modality="audio-video",
                mode="send-receive",
                elem_id="video-source",
                rtc_configuration=get_cloudflare_turn_credentials_async,
@@ -304,39 +161,26 @@ with gr.Blocks(css=css, title="Gemini AV Chat + Search") as demo:
                pulse_color="rgb(255, 255, 255)",
                icon_button_color="rgb(255, 255, 255)",
            )
-        # The image_input_component is now defined above the row for clarity
+        # with gr.Column():
+        #     image_input = gr.Image(
+        #         label="Image", type="numpy", sources=["upload", "clipboard"]
+        #     )
+
-    # The WebRTC component itself is the primary input for audio/video.
-    # The additional image input needs to be correctly wired.
-    # `webrtc.stream` will pass its inputs to the handler's `self.latest_args`.
-    # If webrtc is input[0] and image_input_component is input[1], then in GeminiHandler,
-    # self.latest_args[0] is the webrtc data and self.latest_args[1] is the image_input_component data.
-    webrtc.stream(
-        handler_class=GeminiHandler,  # Pass the class, not an instance, for gr.Blocks
-        inputs=[webrtc, image_input_component],  # webrtc carries audio/video; image_input_component is the static image
-        outputs=[webrtc],  # webrtc carries the audio/video output from Gemini
-        time_limit=180 if get_space() else None,
-        concurrency_limit=2 if get_space() else None,
-    )
+    webrtc.stream(
+        GeminiHandler(),
+        inputs=[webrtc],
+        outputs=[webrtc],
+        time_limit=180 if get_space() else None,
+        concurrency_limit=2 if get_space() else None,
+    )
 
-    # The `stream.ui = demo` line might be for an older way of launching.
-    # For gr.Blocks, `demo.launch()` is standard.
-    # If `fastrtc.Stream` is meant to wrap `gr.Blocks`, its usage might differ.
-    # Assuming a standard Gradio launch:
+stream.ui = demo
 
 
 if __name__ == "__main__":
-    if os.getenv("GEMINI_API_KEY") is None:
-        print("WARNING: GEMINI_API_KEY environment variable not set. The application may not work.")
-
-    # The original code had `stream.ui.launch()`. If `stream` is a `fastrtc.Stream` object,
-    # and it's meant to manage the Gradio app, then that's correct.
-    # If `demo` is the primary Gradio interface, then `demo.launch()` is used.
-    # Let's stick to the original pattern if `fastrtc.Stream` requires it.
-    # stream.ui = demo  # This assignment might be specific to how fastrtc integrates
-
-    # The original structure seems to use fastrtc.Stream to build the UI, then replace its UI with gr.Blocks.
-    # This is a bit unusual. Let's check whether fastrtc.Stream is used by WebRTC.
-    # The WebRTC.stream call implies it handles its own streaming logic with the handler.
-    # So `demo` should be the main UI.
-
-    demo.launch(server_port=7860, debug=True)  # debug=True for development
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        raise ValueError("Phone mode not supported for this demo")
+    else:
+        stream.ui.launch(server_port=7860)
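
One detail of the new `start_up` receive loop worth calling out: each binary chunk from the Live API is reinterpreted as a (1, N) int16 frame before being queued for playback. A standalone sketch of that conversion (a minimal illustration, not part of the commit; `pcm_chunk_to_frame` is a name invented here, and it assumes the chunks are raw 16-bit PCM, as the handler's `dtype=np.int16` implies):

    import numpy as np

    def pcm_chunk_to_frame(data: bytes) -> np.ndarray:
        # Reinterpret the raw bytes as 16-bit samples and add a leading
        # channel axis, matching the (1, N) shape put on audio_queue.
        return np.frombuffer(data, dtype=np.int16).reshape(1, -1)

    # Four bytes of PCM -> one mono frame holding two samples.
    frame = pcm_chunk_to_frame(b"\x00\x01\x02\x03")
    assert frame.shape == (1, 2)

`emit` then hands each queued frame back to fastrtc as `(self.output_sample_rate, array)`, so the sample rate travels alongside the samples rather than being baked into the frame.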