mgokg committed
Commit 3d9d966 · verified · 1 Parent(s): d08d2ca

Update app.py

Files changed (1)
  1. app.py +34 -192
app.py CHANGED
@@ -3,7 +3,6 @@ import base64
 import os
 import time
 from io import BytesIO
-import functools # Added for to_thread
 
 import gradio as gr
 import numpy as np
@@ -17,72 +16,11 @@ from fastrtc import (
     wait_for_item,
 )
 from google import genai
-# Ensure genai.protos is accessible for Tool, FunctionDeclaration etc.
-# If not, you might need from google.generativeai.types import Tool, FunctionDeclaration, Schema, Part, Content
-# However, with live.connect using v1alpha, direct proto usage is often needed.
-from google.generativeai import protos # Explicitly import protos
 from gradio.utils import get_space
 from PIL import Image
-from googleapiclient.discovery import build # Added for Google Search
 
 load_dotenv()
 
-# --- Environment Variables for Google Search ---
-GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
-GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
-
-# --- Google Search Function ---
-async def perform_google_search_async(query: str, num_results: int = 3) -> str:
-    """
-    Performs a Google search using the Custom Search API and returns formatted results.
-    """
-    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_CSE_ID:
-        print("Google Search API key or CSE ID not configured.")
-        return "Search functionality is not configured."
-    try:
-        loop = asyncio.get_running_loop()
-        # Create a partial function for the blocking call
-        partial_search = functools.partial(
-            build("customsearch", "v1", developerKey=GOOGLE_SEARCH_API_KEY).cse().list(
-                q=query, cx=GOOGLE_CSE_ID, num=num_results
-            ).execute
-        )
-        # Run the blocking call in a thread pool
-        res = await loop.run_in_executor(None, partial_search)
-
-        if 'items' in res and res['items']:
-            results = []
-            for item in res['items']:
-                title = item.get('title', 'N/A')
-                link = item.get('link', 'N/A')
-                snippet = item.get('snippet', 'N/A').replace("\n", " ")
-                results.append(f"Title: {title}\nLink: {link}\nSnippet: {snippet}\n---")
-            return "\n".join(results)
-        else:
-            return "No search results found."
-    except Exception as e:
-        print(f"Error during Google Search: {e}")
-        return f"An error occurred while searching: {str(e)}"
-
-# --- Define the Google Search Tool for Gemini ---
-# Using genai.protos directly as LiveSession client might expect raw protos
-google_search_tool = protos.Tool(
-    function_declarations=[
-        protos.FunctionDeclaration(
-            name="perform_google_search",
-            description="Performs a Google search for a given query and returns a summary of the top results. Use this for general web searches or finding specific information online.",
-            parameters=protos.Schema(
-                type=protos.Type.OBJECT,
-                properties={
-                    "query": protos.Schema(type=protos.Type.STRING, description="The search query to use for Google Search."),
-                    "num_results": protos.Schema(type=protos.Type.NUMBER, description="Optional. Number of search results to return (default is 3).")
-                },
-                required=["query"]
-            )
-        )
-    ]
-)
-
 
 def encode_audio(data: np.ndarray) -> dict:
     """Encode Audio data to send to the server"""
@@ -115,148 +53,68 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.session = None
         self.last_frame_time = 0
         self.quit = asyncio.Event()
-        self.client = None # Store client
 
     def copy(self) -> "GeminiHandler":
         return GeminiHandler()
 
     async def start_up(self):
-        self.client = genai.Client( # Use self.client
+        client = genai.Client(
             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
         )
-
-        # Configure Gemini to use the search tool
-        # Note: For v1alpha live client, config might be a dict or protos.StreamingConfig
-        # protos.ToolConfig and protos.FunctionCallingConfig might be needed for more control
-        # e.g. tool_config=protos.ToolConfig(function_calling_config=protos.FunctionCallingConfig(mode=protos.FunctionCallingConfig.Mode.ANY))
-
-        streaming_config = protos.StreamingConfig(
-            response_modalities=[protos.ResponseModality.AUDIO], # Use enum
-            tools=[google_search_tool]
-        )
-        # If you need to force tool usage or set mode:
-        # streaming_config.tool_config.CopyFrom(protos.ToolConfig(
-        #     function_calling_config=protos.FunctionCallingConfig(mode=protos.FunctionCallingConfig.Mode.ANY)
-        # ))
-
-
-        async with self.client.aio.live.connect(
-            model="gemini-2.0-flash-exp", # Or "gemini-1.5-flash-latest" which is known to support tools well
-            config=streaming_config,
+        config = {"response_modalities": ["AUDIO"]}
+        async with client.aio.live.connect(
+            model="gemini-2.0-flash-exp",
+            config=config, # type: ignore
         ) as session:
             self.session = session
-            print("Gemini session started.")
             while not self.quit.is_set():
                 turn = self.session.receive()
                 try:
-                    async for response_proto in turn: # response_proto is protos.Response
-                        # Check for function calls
-                        if response_proto.function_call and response_proto.function_call.name:
-                            fc = response_proto.function_call
-                            if fc.name == "perform_google_search":
-                                query = fc.args["query"]
-                                num_results = fc.args.get("num_results", 3) # Get optional num_results
-                                print(f"Gemini requested Google search for: '{query}' with {num_results} results.")
-
-                                search_results_text = await perform_google_search_async(query, int(num_results))
-                                print(f"Search results (first 200 chars): {search_results_text[:200]}...")
-
-                                # Send search results back to Gemini
-                                function_response_proto = protos.FunctionResponse(
-                                    name="perform_google_search",
-                                    response={"result": search_results_text} # Response must be a dict/struct
-                                )
-                                input_proto = protos.Input(function_response=function_response_proto)
-                                await self.session.send(input=input_proto)
-                                print("Sent search results back to Gemini.")
-
-                        # Handle audio data
-                        elif response_proto.audio_output and response_proto.audio_output.data:
-                            data = response_proto.audio_output.data
+                    async for response in turn:
+                        if data := response.data:
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                            self.audio_queue.put_nowait(audio)
-
-                        # You could also handle response_proto.text_output if needed
-
+                            self.audio_queue.put_nowait(audio)
                 except websockets.exceptions.ConnectionClosedOK:
-                    print("Gemini session connection closed normally.")
+                    print("connection closed")
                     break
-                except Exception as e:
-                    print(f"Error in Gemini session receive loop: {e}")
-                    # Consider how to handle errors, e.g., break or log and continue
-                    break # For now, break on error
-            print("Exited Gemini session receive loop.")
-
 
     async def video_receive(self, frame: np.ndarray):
         self.video_queue.put_nowait(frame)
 
         if self.session:
-            current_time = time.time()
-            if current_time - self.last_frame_time > 1: # Send image every 1 second
-                self.last_frame_time = current_time
-                # The original code sends a dict. For v1alpha, it might need to be wrapped in protos.Input
-                # For simplicity, keeping as dict and assuming SDK handles it.
-                # If issues, wrap: image_part = protos.Part(inline_data=protos.Blob(mime_type="image/jpeg", data=...))
-                # input_proto = protos.Input(parts=[image_part])
-                # await self.session.send(input=input_proto)
+            # send image every 1 second
+            print(time.time() - self.last_frame_time)
+            if time.time() - self.last_frame_time > 1:
+                self.last_frame_time = time.time()
                 await self.session.send(input=encode_image(frame))
-
-        # Handle additional image input from Gradio UI
-        if self.latest_args and len(self.latest_args) > 1 and self.latest_args[1] is not None:
-            # Assuming self.latest_args[1] is the numpy array from the gr.Image input
-            uploaded_image_data = self.latest_args[1]
-            await self.session.send(input=encode_image(uploaded_image_data))
-            # To avoid resending, you might want to clear it after sending
-            # self.latest_args[1] = None # Or handle state more robustly
-
+                if self.latest_args[1] is not None:
+                    await self.session.send(input=encode_image(self.latest_args[1]))
 
     async def video_emit(self):
         frame = await wait_for_item(self.video_queue, 0.01)
         if frame is not None:
             return frame
         else:
-            return np.zeros((100, 100, 3), dtype=np.uint8) # Default blank frame
+            return np.zeros((100, 100, 3), dtype=np.uint8)
 
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        # Audio from user's microphone
         _, array = frame
         array = array.squeeze()
-        audio_message_dict = encode_audio(array) # This is a dict
-
+        audio_message = encode_audio(array)
         if self.session:
-            # For v1alpha, input should be protos.Input.
-            # The SDK might convert the dict, but explicit is safer.
-            audio_data_bytes = base64.b64decode(audio_message_dict["data"])
-            audio_part = protos.Part(
-                audio_input=protos.AudioData(
-                    audio=audio_data_bytes,
-                    # sample_rate_hertz=self.input_sample_rate # If API needs it
-                )
-            )
-            input_proto = protos.Input(parts=[audio_part])
-            await self.session.send(input=input_proto)
-
+            await self.session.send(input=audio_message)
 
     async def emit(self):
-        # Audio to user's speakers (from Gemini)
         array = await wait_for_item(self.audio_queue, 0.01)
         if array is not None:
             return (self.output_sample_rate, array)
-        return None # Return None if no audio, Gradio handles it
+        return array
 
     async def shutdown(self) -> None:
-        print("Shutting down GeminiHandler...")
         if self.session:
             self.quit.set()
-            try:
-                await self.session.close()
-                print("Gemini session closed.")
-            except Exception as e:
-                print(f"Error closing Gemini session: {e}")
-            self.quit.clear()
-        # Clean up client if necessary, though it's managed by 'async with' in start_up
-        self.client = None
+            await self.session.close()
+            self.quit.clear()
 
 
 stream = Stream(
@@ -272,7 +130,7 @@ stream = Stream(
         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
         "pulse_color": "rgb(255, 255, 255)",
         "icon_button_color": "rgb(255, 255, 255)",
-        "title": "Gemini Audio Video Chat with Search", # Updated title
+        "title": "Gemini Audio Video Chat",
     },
 )
 
@@ -283,18 +141,12 @@ css = """
 with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
-    <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
-        <div style="background-color: var(--block-background-fill); border-radius: 8px">
-            <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
-        </div>
         <div>
-            <h1>Gen AI SDK Voice Chat with Google Search</h1>
-            <p>Speak with Gemini using real-time audio + video streaming, now with Google Search capability!</p>
-            <p>Try saying: "Search for the weather in London" or "Google the latest AI news."</p>
-            <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
-            <p>Get a Gemini API Key <a href="https://aistudio.google.com/app/apikey">here</a>. You'll also need a Google Search API Key and CSE ID.</p>
-        </div>
-    </div>
+        <center>
+        <h1>Gen AI Voice Chat</h1>
+        <p>real-time audio + video streaming</p>
+        </center>
+        </div>
     """
     )
     with gr.Row() as row:
@@ -309,17 +161,14 @@ with gr.Blocks(css=css) as demo:
             pulse_color="rgb(255, 255, 255)",
             icon_button_color="rgb(255, 255, 255)",
         )
-        with gr.Column():
-            image_input = gr.Image(
-                label="Image (optional, sent with video frames)", type="numpy", sources=["upload", "clipboard"]
-            )
+        #with gr.Column():
+            #image_input = gr.Image(
+                #label="Image", type="numpy", sources=["upload", "clipboard"]
+            #)
 
-    # The WebRTC.stream method will pass these inputs to the handler's methods.
-    # The handler's __init__ or other methods might need to store/access `image_input` if needed beyond `latest_args`.
-    # The `latest_args` in `video_receive` comes from the `inputs` list here.
     webrtc.stream(
-        GeminiHandler(), # A new instance of GeminiHandler for each stream session
-        inputs=[webrtc, image_input], # webrtc is args[0], image_input is args[1]
+        GeminiHandler(),
+        inputs=[webrtc],
         outputs=[webrtc],
         time_limit=180 if get_space() else None,
         concurrency_limit=2 if get_space() else None,
@@ -329,16 +178,9 @@ stream.ui = demo
 
 
 if __name__ == "__main__":
-    if not os.getenv("GEMINI_API_KEY"):
-        print("GEMINI_API_KEY not found in environment variables. Please set it in your .env file.")
-    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_CSE_ID:
-        print("GOOGLE_SEARCH_API_KEY or GOOGLE_CSE_ID not found. Search functionality will be limited.")
-
     if (mode := os.getenv("MODE")) == "UI":
         stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported for this demo")
     else:
-        # Default to UI launch if MODE is not set or unrecognized
-        print("Launching Gradio UI...")
-        stream.ui.launch(server_port=7860)
+        stream.ui.launch(server_port=7860)
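
Note on the pattern the commit lands on: the proto-based tool plumbing is replaced by the plain dict config that the google-genai live API accepts. Below is a minimal standalone sketch of that connect/receive loop, assuming the same GEMINI_API_KEY variable, v1alpha http_options, and gemini-2.0-flash-exp model as app.py; the text turn, the end_of_turn flag, and the main() wrapper are illustrative assumptions, not part of the committed file.

import asyncio
import os

import numpy as np
from google import genai


async def main() -> None:
    # Same client setup as app.py; the live API is served under v1alpha.
    client = genai.Client(
        api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
    )
    config = {"response_modalities": ["AUDIO"]}
    async with client.aio.live.connect(
        model="gemini-2.0-flash-exp", config=config
    ) as session:
        # Illustrative single text turn; app.py streams audio/image dicts instead.
        await session.send(input="Hello", end_of_turn=True)
        turn = session.receive()
        async for response in turn:
            if data := response.data:
                # Decode 16-bit PCM the same way the handler's receive loop does.
                audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                print(f"received {audio.shape[1]} samples")


if __name__ == "__main__":
    asyncio.run(main())

The dict config is enough here because the commit drops tool declarations entirely; anything beyond response_modalities (tools, tool_config) would reintroduce the typed-config question the removed code was wrestling with.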