Update app.py
app.py CHANGED
@@ -1,843 +1,191 @@
-import os
 import asyncio
 import base64
-import
-import
-import
-
-import PIL.Image
-import mss
-import mss.tools
 import gradio as gr
 from google import genai
-from
 
-GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
-if not GEMINI_API_KEY:
-    raise ValueError("GEMINI_API_KEY environment variable not set.")
 
-# Audio settings
-PYAUDIO_FORMAT = pyaudio.paInt16
-CHANNELS = 1
-SEND_SAMPLE_RATE = 16000  # Sample rate for audio sent to Gemini
-RECEIVE_SAMPLE_RATE = 24000  # Sample rate for audio received from Gemini (Puck voice)
-CHUNK_SIZE = 1024
-
 
-# Streaming Modes
-VIDEO_MODE_CAMERA = "camera"
-VIDEO_MODE_SCREEN = "screen"
-VIDEO_MODE_NONE = "none"  # Added for audio/text only
-DEFAULT_VIDEO_MODE = VIDEO_MODE_CAMERA
-
 
-# --- GeminiStreamingClient Class ---
-class GeminiStreamingClient:
-    def __init__(self, video_mode=DEFAULT_VIDEO_MODE,
-                 on_text_received=None, on_audio_received=None, on_error=None):
-        self.video_mode = video_mode
-        self.on_text_received = on_text_received
-        self.on_audio_received = on_audio_received
-        self.on_error = on_error
-
-        self
-
-        self.capture_device = None  # For camera
-
-        self.genai_client = genai.GenerativeModel(
-            MODEL_NAME,
-            system_instruction=types.Content(
-                parts=[types.Part.from_text(text="Du bist ein hilfreicher Assistent. Antworte immer auf Deutsch.")],
-                role="user"  # System instructions are typically role 'user' or 'model' then 'user'
-            )
         )
-
-    async def _get_frame_bytes(self):
-        if not self.capture_device or not self.capture_device.isOpened():
-            print("Camera not initialized or opened.")
-            await asyncio.sleep(1)  # Prevent tight loop if camera fails
-            return None
-        ret, frame = await asyncio.to_thread(self.capture_device.read)
-        if not ret:
-            print("Failed to grab frame from camera.")
-            return None
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        img = PIL.Image.fromarray(frame_rgb)
-        img.thumbnail((1024, 1024))  # Resize
-        image_io = io.BytesIO()
-        img.save(image_io, format="JPEG")
-        image_io.seek(0)
-        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
-
-    async def _get_screen_bytes(self):
-        with mss.mss() as sct:
-            monitor = sct.monitors[1]  # Primary monitor
-            sct_img = sct.grab(monitor)
-            img = PIL.Image.frombytes("RGB", sct_img.size, sct_img.rgb, "raw", "RGB")
-            image_io = io.BytesIO()
-            img.save(image_io, format="JPEG")
-            image_io.seek(0)
-            return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
-
-    async def _stream_visual_media(self):
-        if self.video_mode == VIDEO_MODE_CAMERA:
-            self.capture_device = await asyncio.to_thread(cv2.VideoCapture, 0)
-            if not self.capture_device.isOpened():
-                print("Error: Could not open camera.")
-                if self.on_error: self.on_error("Could not open camera.")
-                return
-            get_media_bytes = self._get_frame_bytes
-        elif self.video_mode == VIDEO_MODE_SCREEN:
-            get_media_bytes = self._get_screen_bytes
-        else:  # VIDEO_MODE_NONE or unknown
-            return  # No visual media to stream
-
-        while self.is_running:
-            try:
-                media_data = await get_media_bytes()
-                if media_data:
-                    await self.media_out_queue.put(media_data)
-                await asyncio.sleep(1.0)  # Capture frame every second
-            except Exception as e:
-                print(f"Error in visual media stream: {e}")
-                traceback.print_exc()
-                if self.on_error: self.on_error(f"Visual media stream error: {e}")
-                await asyncio.sleep(1)  # Avoid tight loop on error
-
-        if self.video_mode == VIDEO_MODE_CAMERA and self.capture_device:
-            self.capture_device.release()
-            self.capture_device = None
-
-    async def _listen_microphone(self):
-        try:
-            mic_info = self.pya.get_default_input_device_info()
-            self.mic_stream = self.pya.open(
-                format=PYAUDIO_FORMAT,
-                channels=CHANNELS,
-                rate=SEND_SAMPLE_RATE,
-                input=True,
-                input_device_index=mic_info["index"],
-                frames_per_buffer=CHUNK_SIZE,
-            )
-        except Exception as e:
-            print(f"Error opening microphone stream: {e}")
-            if self.on_error: self.on_error(f"Microphone error: {e}")
-            return
-
-        print("Microphone listener started.")
-        while self.is_running and self.mic_stream:
-            try:
-                data = await asyncio.to_thread(self.mic_stream.read, CHUNK_SIZE, exception_on_overflow=False)
-                await self.media_out_queue.put({"mime_type": "audio/pcm", "data": data, "sample_rate": SEND_SAMPLE_RATE})
-            except IOError as e:  # Stream closed or other issue
-                if e.errno == pyaudio.paInputOverflowed:
-                    print("Input overflowed. Skipping.")
-                    continue
-                print(f"Error reading from microphone: {e}")
-                if self.on_error: self.on_error(f"Mic read error: {e}")
-                break  # Exit loop on significant error
-            except Exception as e:
-                print(f"Unexpected error in microphone listener: {e}")
-                traceback.print_exc()
-                if self.on_error: self.on_error(f"Mic listener error: {e}")
-                break
-        print("Microphone listener stopped.")
-
-    async def _play_gemini_audio(self):
-        try:
-            self.speaker_stream = self.pya.open(
-                format=PYAUDIO_FORMAT,
-                channels=CHANNELS,
-                rate=RECEIVE_SAMPLE_RATE,
-                output=True
-            )
-        except Exception as e:
-            print(f"Error opening speaker stream: {e}")
-            if self.on_error: self.on_error(f"Speaker error: {e}")
-            return
-
-        print("Audio playback started.")
-        while self.is_running or not self.audio_in_queue.empty():  # Process remaining queue even if stopping
-            try:
-                audio_chunk = await self.audio_in_queue.get()
-                if audio_chunk is None:  # Sentinel for stopping
-                    break
-                if self.speaker_stream:
-                    await asyncio.to_thread(self.speaker_stream.write, audio_chunk)
-                self.audio_in_queue.task_done()
-            except Exception as e:
-                print(f"Error playing audio: {e}")
-                if self.on_error: self.on_error(f"Audio playback error: {e}")
-                # Don't break the loop for playback errors, just log and continue
-        print("Audio playback stopped.")
-
-    async def _process_gemini_responses(self):
-        print("Starting to process Gemini responses...")
-        try:
-            # The new API uses generate_content with a stream of Parts.
-            # We need to build up the content to send.
-            # This part needs careful handling of how media_out_queue items are consumed.
-            # For a continuous conversation, we'd typically send an initial prompt,
-            # then subsequent inputs (audio/video/text) as parts of the ongoing conversation.
-
-            # This simplified model sends one "turn" at a time based on media_out_queue.
-            # A more robust solution would manage conversation history.
-
-            # `generate_content(stream=True)` expects an iterable of `Part` or `Content` objects.
-            # We'll create a generator that yields content from our `media_out_queue`.
-            async def content_generator():
-                while self.is_running or not self.media_out_queue.empty():
-                    try:
-                        item = await asyncio.wait_for(self.media_out_queue.get(), timeout=0.1)
-                        if item is None:  # Sentinel
-                            break
-
-                        content_parts = []
-                        if "text" in item:
-                            content_parts.append(types.Part.from_text(item["text"]))
-                        elif item["mime_type"].startswith("image/"):
-                            content_parts.append(types.Part.from_data(
-                                data=base64.b64decode(item["data"]),
-                                mime_type=item["mime_type"]
-                            ))
-                        elif item["mime_type"] == "audio/pcm":
-                            # For audio, it's better to send it as part of the overall content.
-                            # The API expects audio data directly.
-                            content_parts.append(types.Part.from_data(
-                                data=item["data"],
-                                mime_type=item["mime_type"]  # or audio/wav if converted
-                            ))
-
-                        if content_parts:
-                            # print(f"Sending content to Gemini: {content_parts}")
-                            yield types.Content(parts=content_parts, role="user")  # Each item from queue is a new user turn
-                        self.media_out_queue.task_done()
-
-                    except asyncio.TimeoutError:
-                        continue  # No new media, continue checking
-                    except Exception as e:
-                        print(f"Error in content_generator: {e}")
-                        if self.on_error: self.on_error(f"Content gen error: {e}")
-                        break
-                print("Content generator finished.")
-
-            # Configuration for audio output (if the model supports it directly).
-            # This is a bit different from the old LiveConnectConfig.
-            generation_config = types.GenerationConfig(
-                # candidate_count=1,  # default
-                # stop_sequences=[],
-                # max_output_tokens=2048,  # default
-                # temperature=0.9,  # default
-                response_mime_type="audio/pcm",  # Request audio output
-                response_schema=types.Schema(
-                    type=types.Type.OBJECT,
-                    properties={
-                        'audio_data': types.Schema(type=types.Type.STRING, format="byte", description="The audio data in PCM format."),
-                        'text_response': types.Schema(type=types.Type.STRING, description="The textual part of the response.")
-                    }
-                )
-            )
-            # Note: the above response_schema is an example. The actual way to get audio
-            # might be simpler if the model directly outputs it when asked.
-            # The "Puck" voice was part of LiveConnect; with the GenAI API it's more complex.
-            # Let's assume the model can provide audio if `response_mime_type="audio/pcm"` is set
-            # and the model supports it. If not, we'd need a separate TTS step.
-            # For now, we'll primarily focus on text responses and playing them if audio is somehow provided.
-
-            # `stream=True` with `generate_content` is for streaming *responses*.
-            # For streaming *requests*, the input to `generate_content` should be an iterable.
-
-            # This is a conceptual challenge: `generate_content` is typically called once per "turn".
-            # To have a continuous stream of input media, we might need to use the lower-level
-            # `chat_session` or structure this differently.
-            # For simplicity, let's assume we send a batch of media from the queue as one turn.
-
-            # Let's re-think: the original code used `client.aio.live.connect`, which is a persistent session.
-            # `GenerativeModel.generate_content` is more request-response, even if streamed.
-            # To replicate the live feel, we might need to send messages in a loop.
-
-            # For now, let's simplify: `_process_gemini_responses` handles one "turn" when `send_text_input` is called.
-            # The background audio/video will be collected and sent with that text.
-            # This is a deviation from the original continuous stream but fits `generate_content` better.
-            # A true continuous bi-directional stream might require the (now less common) Live API or a different approach.
-
-            # Let's revert to a model closer to the original `session.receive()` if possible,
-            # or adapt to how `generate_content(stream=True)` works for responses.
-            # The `stream=True` in `generate_content` means the *response* is streamed.
-            # We need to send our `media_out_queue` items as part of the *request*.
-
-            # This part is tricky with the standard GenAI Python SDK for a "live" feel.
-            # The original code used a specific `live.connect` endpoint.
-            # If we stick to `GenerativeModel`, we'd typically do:
-            #   model.start_chat()
-            #   response = chat.send_message(..., stream=True)
-            # This is still turn-based.
-
-            # Let's assume the goal is to send a collection of media (text, last audio, last image)
-            # and get a streamed response.
-
-            # This method will now be triggered by `send_text_input`.
-            # The `media_out_queue` will be drained to form the content for `send_message`.
-            # This is a significant change from the original's continuous background sending.
-            pass  # This method will be effectively merged into `send_text_input` logic for now.
-
-    async def start(self):
-        if self.is_running:
-            print("Session already running.")
-            return
-        print("Starting Gemini streaming client...")
-        self.is_running = True
-
-        self.tasks.append(asyncio.create_task(self._listen_microphone()))
-        if self.video_mode != VIDEO_MODE_NONE:
-            self.tasks.append(asyncio.create_task(self._stream_visual_media()))
-        self.tasks.append(asyncio.create_task(self._play_gemini_audio()))
-        # self.tasks.append(asyncio.create_task(self._process_gemini_responses()))  # Now handled differently
-
-        print(f"Client started with video mode: {self.video_mode}. Tasks: {len(self.tasks)}")
-
-    async def stop(self):
-        if not self.is_running:
-            print("Session not running.")
-            return
-        print("Stopping Gemini streaming client...")
-        self.is_running = False
-
-        # Signal media processing loops to stop
-        if self.video_mode != VIDEO_MODE_NONE and self.capture_device:
-            if self.video_mode == VIDEO_MODE_CAMERA:  # Only release if it's cv2.VideoCapture
-                if self.capture_device.isOpened():
-                    self.capture_device.release()
-            self.capture_device = None
-
-        if self.mic_stream:
-            self.mic_stream.stop_stream()
-            self.mic_stream.close()
-            self.mic_stream = None
-
-        await self.media_out_queue.put(None)  # Sentinel for content generator if it were still separate
-        await self.audio_in_queue.put(None)  # Sentinel for audio player
-
-        # Cancel and await tasks
-        for task in self.tasks:
-            task.cancel()
-        await asyncio.gather(*self.tasks, return_exceptions=True)
-        self.tasks = []
-
-        if self.speaker_stream:
-            self.speaker_stream.stop_stream()
-            self.speaker_stream.close()
-            self.speaker_stream = None
-
-        self.pya.terminate()
-        print("Client stopped.")
 
-
-            if self.on_error: self.on_error("Session not active. Cannot send text.")
-            return
 
                     break
-                temp_media_holder.append(media_item)
-                self.media_out_queue.task_done()
-            except asyncio.QueueEmpty:
-                break
-
-        # Prioritize one image and some recent audio
-        last_image_part = None
-        audio_parts = []
-
-        for item in reversed(temp_media_holder):  # Process most recent first
-            if item["mime_type"].startswith("image/") and not last_image_part:
-                last_image_part = types.Part.from_data(
-                    data=base64.b64decode(item["data"]),
-                    mime_type=item["mime_type"]
-                )
-            elif item["mime_type"] == "audio/pcm" and len(audio_parts) < 5:  # Limit audio segments
-                # The API expects raw bytes for audio/pcm
-                audio_parts.append(types.Part.from_data(data=item["data"], mime_type=item["mime_type"]))
-
-        if last_image_part:
-            content_parts.append(last_image_part)
-        content_parts.extend(reversed(audio_parts))  # Add audio in chronological order
-
-        # Re-queue any unused media items (not ideal, but simple for now)
-        # for item in temp_media_holder:
-        #     if (item["mime_type"].startswith("image/") and item != last_image_part_source) or \
-        #        (item["mime_type"] == "audio/pcm" and item not in audio_parts_sources):
-        #         await self.media_out_queue.put(item)
 
-        #
-
-            for part in chunk.parts:
-                if part.text:
-                    # print(part.text, end="", flush=True)  # Stream text to console
-                    full_response_text += part.text
-                    if self.on_text_received:  # Callback for Gradio UI
-                        # Send incremental text for streaming display
-                        await self.on_text_received(part.text, is_final=False)
-
-                # Check for audio data - this part is speculative for generate_content,
-                # as direct audio output like the "Puck" voice was specific to LiveConnect.
-                # If the model returns audio, it would likely be in `part.data` or `part.audio_data`.
-                # For example, if `response_mime_type="audio/pcm"` worked and returned raw bytes:
-                if hasattr(part, 'data') and part.mime_type and part.mime_type.startswith("audio/"):
-                    print(f"Received audio chunk of type {part.mime_type}")
-                    await self.audio_in_queue.put(part.data)
-                elif hasattr(part, 'inline_data') and part.inline_data.mime_type.startswith("audio/"):
-                    # This is how function calling results with audio might look
-                    print(f"Received inline audio chunk of type {part.inline_data.mime_type}")
-                    await self.audio_in_queue.put(part.inline_data.data)
-
-            if self.on_text_received and full_response_text:  # Send final accumulated text
-                await self.on_text_received(full_response_text, is_final=True)
-            # print()  # Newline after streaming full response
-
-        except Exception as e:
-            print(f"Error during Gemini communication: {e}")
-            traceback.print_exc()
-            if self.on_error: self.on_error(f"Gemini API error: {e}")
-            if self.on_text_received:  # Clear any partial text
-                await self.on_text_received(f"Error: {e}", is_final=True)
-
-
-# --- Gradio UI ---
-async def build_gradio_app():
-    # Gradio State
-    chat_history_var = gr.State([])
-    client_session_var = gr.State(None)
-    current_bot_message_var = gr.State("")  # To accumulate streaming response
-
-    async def handle_text_input(text_input, chat_history, client_session, current_bot_message):
-        if not client_session:
-            gr.Warning("Session not started. Please start the session first.")
-            return chat_history, "", current_bot_message  # No change to text input
-
-        # Add user message to chat
-        chat_history.append({"role": "user", "content": text_input})
-
-        # Clear current bot message accumulator before new response
-        current_bot_message = ""
-
-        # Send text to the Gemini client (which will also pick up queued media).
-        # The client will use callbacks to update the chat_history for the bot's response.
-        asyncio.create_task(client_session.send_text_input(text_input))
-
-        # Return updated history (user part) and clear the input box.
-        # The bot response will be added via callback.
-        return chat_history, "", current_bot_message
-
-    async def update_chatbot_display(text_chunk, is_final, chat_history, current_bot_message):
-        if not chat_history:  # Should not happen if user message was added
-            chat_history.append({"role": "assistant", "content": ""})
-
-        if is_final:
-            # If it's the final message, ensure the last entry is the complete bot message
-            if chat_history and chat_history[-1]["role"] == "assistant":
-                chat_history[-1]["content"] = current_bot_message + text_chunk  # Append final chunk
-            else:  # Should not happen if streaming correctly
-                chat_history.append({"role": "assistant", "content": current_bot_message + text_chunk})
-            current_bot_message = ""  # Reset accumulator
         else:
-
-        # The final text will update the chatbot.
-        # A more advanced Gradio setup would use `gr.Textbox.stream` or similar.
-
-        # This callback will be passed to the client.
-        # It needs to update `chat_history_var` and `current_bot_message_var`
-        # and then trigger an update of the `chatbot_display`.
-        # This is complex because Gradio's state updates are tied to its event loop.
-
-        # Let's simplify: the callback will update a shared queue, and Gradio will poll it.
-        # Or, for this example, let the callback directly try to update,
-        # understanding it might have issues if not on Gradio's main thread.
-        # `update_chatbot_display` will be the target.
-
-        # This is where the refactor gets tricky with Gradio's model.
-        # The `update_chatbot_display` function is designed to be a Gradio output function.
-        # We can't easily call it directly with new state.
-
-        # Alternative: the `on_text_received` callback in the client will update `chat_history_var`
-        # and `current_bot_message_var` directly. Then, we need a way to make Gradio re-render
-        # the chatbot. This is often done by having the function that *triggers* the action
-        # also return the updated chatbot state.
-
-        # For now, let's make the callback simple:
-        # it will print, and we'll handle the final update in the `handle_text_input` response.
-        # This means no live streaming text in the Gradio UI, only the final response.
-        # To get live streaming, `handle_text_input` would need to yield updates.
-
-        # Let's try to make `update_chatbot_display` usable as a callback target
-        # by having it update the state variables.
-
-        # This is a conceptual placeholder. The actual update will be managed
-        # by how `handle_text_input` is structured if it were to support streaming yields.
-        # For now, the client's `on_text_received` will be simpler.
-        pass
-
-    async def start_stop_session(action, video_mode, current_client_session, current_chat_history, current_bot_msg):
-        if action == "Start Session":
-            if current_client_session:
-                gr.Info("Session already active.")
-                return "Stop Session", current_client_session, current_chat_history, current_bot_msg, gr.Button(interactive=True)
-
-            gr.Info(f"Starting session with mode: {video_mode}...")
-
-            # This list will store text chunks for the chatbot.
-            # It will be updated by the callback.
-            _chat_history_accumulator = list(current_chat_history)  # Make a mutable copy
-            _current_bot_message_accumulator = str(current_bot_msg)
-
-            async def ui_on_text_received(text_chunk, is_final):
-                nonlocal _current_bot_message_accumulator  # Allow modification
-                # This callback is tricky because it needs to update Gradio UI elements,
-                # which are typically updated by returning values from event handlers.
-                # For streaming, event handlers can be generators (yield).
-                # Here, the text comes from a background task.
-                print(f"UI_TEXT_CB: {text_chunk[:50]}... (final: {is_final})")
-
-                if is_final:
-                    if _chat_history_accumulator and _chat_history_accumulator[-1]["role"] == "assistant":
-                        _chat_history_accumulator[-1]["content"] = _current_bot_message_accumulator + text_chunk
-                    else:
-                        _chat_history_accumulator.append({"role": "assistant", "content": _current_bot_message_accumulator + text_chunk})
-                    _current_bot_message_accumulator = ""
-                else:
-                    _current_bot_message_accumulator += text_chunk
-                    if _chat_history_accumulator and _chat_history_accumulator[-1]["role"] == "assistant":
-                        _chat_history_accumulator[-1]["content"] = _current_bot_message_accumulator
-                    elif not _chat_history_accumulator or _chat_history_accumulator[-1]["role"] == "user":
-                        _chat_history_accumulator.append({"role": "assistant", "content": _current_bot_message_accumulator})
-
-                # This callback doesn't directly return to Gradio to update the UI.
-                # The UI update for the chatbot will happen when `handle_text_input` completes,
-                # or if `start_stop_session` could yield updates (it can't easily for background events).
-                # This is a common challenge with Gradio and background tasks updating the UI.
-                # For now, `_chat_history_accumulator` is updated, and `handle_text_input` will use it.
-
-            def ui_on_audio_received(audio_chunk):
-                # PyAudio in the client handles playback. This is for potential Gradio audio out.
-                # print(f"UI_AUDIO_CB: Received audio chunk of {len(audio_chunk)} bytes.")
-                # This would yield to gr.Audio if we used it for playback.
-                pass  # Let PyAudio in client handle playback
-
-            def ui_on_error(error_msg):
-                gr.Error(f"Session Error: {error_msg}")
-                # Potentially try to stop the session here or update UI state
-                print(f"UI_ERROR_CB: {error_msg}")
-
-            client = GeminiStreamingClient(
-                video_mode=video_mode,
-                on_text_received=ui_on_text_received,  # This callback needs to update Gradio state
-                on_audio_received=ui_on_audio_received,
-                on_error=ui_on_error
             )
-
-            gr.
-
-            # This is indirect. The callback `ui_on_text_received` should ideally update the gr.State.
-            # For now, we pass the accumulator list.
-            return "Stop Session", client, _chat_history_accumulator, _current_bot_message_accumulator, gr.Button(interactive=True)
-
-        elif action == "Stop Session":
-            if current_client_session:
-                gr.Info("Stopping session...")
-                await current_client_session.stop()
-                gr.Info("Session stopped.")
-                return "Start Session", None, current_chat_history, "", gr.Button(interactive=True)  # Clear client, keep history
-            gr.Info("No active session to stop.")
-            return "Start Session", None, current_chat_history, "", gr.Button(interactive=True)
-
-    with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# Gemini Live Streaming Chat")
-        gr.Markdown(f"Using Model: `{MODEL_NAME}`. Ensure your `GEMINI_API_KEY` is set.")
-
-        with gr.Row():
-            video_mode_dropdown = gr.Dropdown(
-                choices=[VIDEO_MODE_CAMERA, VIDEO_MODE_SCREEN, VIDEO_MODE_NONE],
-                value=DEFAULT_VIDEO_MODE,
-                label="Video/Screen Input Mode",
-                interactive=True
             )
-            start_stop_button = gr.Button("Start Session")
-
-        chatbot_display = gr.Chatbot(
-            label="Conversation",
-            bubble_full_width=False,
-            height=600
-        )
-        # This audio output is for if Gemini sends audio that Gradio should play.
-        # Our client plays it via PyAudio, so this might be redundant or for a different use.
-        # audio_output_display = gr.Audio(label="Gemini Response Audio", autoplay=True, streaming=True, interactive=False)
-
-        text_input_box = gr.Textbox(
-            label="Send a message",
-            placeholder="Type your message here or just talk (if mic is active)...",
-            interactive=True
-        )
-        submit_button = gr.Button("Send", interactive=False)  # Disabled until session starts
 
-            fn=start_stop_session,
-            inputs=[start_stop_button, video_mode_dropdown, client_session_var, chat_history_var, current_bot_message_var],
-            outputs=[start_stop_button, client_session_var, chat_history_var, current_bot_message_var, text_input_box, submit_button]  # Also update interactivity of text input
-        ).then(
-            fn=start_stop_button_update,
-            inputs=[client_session_var],
-            outputs=[text_input_box, submit_button]
-        )
-
-        # When text is submitted (via Enter or the Send button).
-        # This is where the main interaction logic for text input happens.
-        # It needs to be a generator to stream responses to the chatbot.
-        async def process_and_stream_text(text_input_val, chat_history_list, client_session_obj, current_bot_msg_val):
-            if not client_session_obj:
-                gr.Warning("Session not started.")
-                # Yield current state to avoid clearing input if session not started
-                gradio_chat_tuples = []
-                for msg in chat_history_list:  # Convert history to display format
-                    gradio_chat_tuples.append((msg.get("content") if msg.get("role") == "user" else None,
-                                               msg.get("content") if msg.get("role") == "assistant" else None))
-                yield gradio_chat_tuples, chat_history_list, text_input_val, current_bot_msg_val
-                return
-
-            # Add user message to chat history state
-            chat_history_list.append({"role": "user", "content": text_input_val})
-
-            # Convert to Gradio display format
-            gradio_chat_tuples = []
-            for msg in chat_history_list:
-                gradio_chat_tuples.append((msg.get("content") if msg.get("role") == "user" else None,
-                                           msg.get("content") if msg.get("role") == "assistant" else None))
-
-            # Yield user message immediately
-            yield gradio_chat_tuples, chat_history_list, "", current_bot_msg_val  # Clear input box
-
-            # Prepare for the bot's response (streaming).
-            # The client's on_text_received callback will update chat_history_list and current_bot_msg_val.
-            # We need to make sure this function can access those updates.
-            # The `ui_on_text_received` callback in `start_stop_session` updates `_chat_history_accumulator`,
-            # which is then assigned to `chat_history_var`.
-
-            # This is the tricky part: how to make this generator aware of updates from the background callback?
-            # One way: the callback sets a flag or puts data in a queue that this generator polls.
-
-            # Let's redefine the client's text callback for this specific Gradio streaming context.
-            # This means the `GeminiStreamingClient` needs to be flexible with its callback.
-
-            # For now, let's assume `client_session_obj.send_text_input` will trigger the callback,
-            # and the callback updates `chat_history_list` and `current_bot_msg_val` (passed by reference).
-
-            # The `ui_on_text_received` callback (defined within `start_stop_session`)
-            # is already designed to modify `_chat_history_accumulator` and `_current_bot_message_accumulator`.
-            # When `start_stop_session` returns, these are put into `chat_history_var` and `current_bot_message_var`.
-            # So, `chat_history_list` and `current_bot_msg_val` here *should* be the updated ones.
-
-            # Trigger Gemini.
-            # The actual streaming of Gemini's response to the UI will be handled by how `send_text_input`
-            # and its callbacks are set up. If `ui_on_text_received` can trigger a re-yield here, that's ideal.
-            # Gradio's streaming usually involves the event handler itself yielding multiple times.
-
-            # Let's simplify: `send_text_input` is fire-and-forget here.
-            # The `ui_on_text_received` callback will be responsible for updating the shared state.
-            # This generator needs to periodically check that shared state and yield.
-
-            # This is a simplified approach: we send the text, then we assume the callback
-            # `ui_on_text_received` (which is set up when the session starts) will update
-            # `chat_history_list` (which is `chat_history_var`).
-            # We then just need to yield the final state of `chat_history_list`.
-            # This won't give live character-by-character streaming in the UI from this function alone.
-
-            # To achieve true streaming in the Gradio UI from a background task:
-            # 1. The background task (Gemini client) puts response chunks into an asyncio.Queue.
-            # 2. This Gradio event handler (`process_and_stream_text`) reads from that queue and yields.
-
-            # Let's modify `GeminiStreamingClient` to accept a queue for text output.
-            # For now, let's stick to the callback updating the shared `chat_history_list`.
-            # The `ui_on_text_received` callback needs to be robust.
-
-            # Send text to the client. The callback `ui_on_text_received` (configured during session start)
-            # will update `chat_history_list` and `current_bot_msg_val` "in the background".
-            await client_session_obj.send_text_input(text_input_val)
-
-            # After `send_text_input` completes (which means Gemini finished responding),
-            # `chat_history_list` should contain the full conversation.
-            # The `ui_on_text_received` callback should have populated it.
-
-            final_gradio_tuples = []
-            for msg in chat_history_list:  # chat_history_list is chat_history_var's value
-                final_gradio_tuples.append((msg.get("content") if msg.get("role") == "user" else None,
-                                            msg.get("content") if msg.get("role") == "assistant" else None))
-
-            # Yield the final state
-            yield final_gradio_tuples, chat_history_list, "", ""  # Clear bot message accumulator too
-
-        text_input_box.submit(
-            fn=process_and_stream_text,
-            inputs=[text_input_box, chat_history_var, client_session_var, current_bot_message_var],
-            outputs=[chatbot_display, chat_history_var, text_input_box, current_bot_message_var]
-        )
-        submit_button.click(
-            fn=process_and_stream_text,
-            inputs=[text_input_box, chat_history_var, client_session_var, current_bot_message_var],
-            outputs=[chatbot_display, chat_history_var, text_input_box, current_bot_message_var]
         )
-
-        # Graceful shutdown
-        async def on_close():
-            print("Gradio app is closing. Stopping client session if active.")
-            client_session = client_session_var.value  # How to get state here? This is tricky.
-            # Gradio doesn't have a clean "on_shutdown" hook
-            # that easily accesses gr.State values from Python.
-            # This would typically be handled by the user clicking "Stop Session".
-            # For now, this is a placeholder. Proper cleanup requires careful state management.
-            # A better way is to ensure the user stops the session via the button.
-            # If `client_session_var` could be accessed here, we'd do:
-            # if client_session_var.value:
-            #     await client_session_var.value.stop()
-            print("Cleanup logic in on_close needs robust state access or manual stop.")
 
-    return app
 
-# --- Main Execution ---
 if __name__ == "__main__":
-    if
-
     else:
-
-        gradio_app = asyncio.run(build_gradio_app())
-        gradio_app.queue()  # Enable queuing for handling multiple users or long processes
-        gradio_app.launch(debug=True)  # Share=True for public link

 import asyncio
 import base64
+import os
+import time
+from io import BytesIO
+
 import gradio as gr
+import numpy as np
+import websockets
+from dotenv import load_dotenv
+from fastrtc import (
+    AsyncAudioVideoStreamHandler,
+    Stream,
+    WebRTC,
+    get_cloudflare_turn_credentials_async,
+    wait_for_item,
+)
 from google import genai
+from gradio.utils import get_space
+from PIL import Image
 
+load_dotenv()
 
+def encode_audio(data: np.ndarray) -> dict:
+    """Encode Audio data to send to the server"""
+    return {
+        "mime_type": "audio/pcm",
+        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
+    }
 
+def encode_image(data: np.ndarray) -> dict:
+    with BytesIO() as output_bytes:
+        pil_image = Image.fromarray(data)
+        pil_image.save(output_bytes, "JPEG")
+        bytes_data = output_bytes.getvalue()
+    base64_str = str(base64.b64encode(bytes_data), "utf-8")
+    return {"mime_type": "image/jpeg", "data": base64_str}
 
+class GeminiHandler(AsyncAudioVideoStreamHandler):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__(
+            "mono",
+            output_sample_rate=24000,
+            input_sample_rate=16000,
         )
+        self.audio_queue = asyncio.Queue()
+        self.video_queue = asyncio.Queue()
+        self.session = None
+        self.last_frame_time = 0
+        self.quit = asyncio.Event()
 
+    def copy(self) -> "GeminiHandler":
+        return GeminiHandler()
 
+    async def start_up(self):
+        client = genai.Client(
+            api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
+        )
+        config = {"response_modalities": ["AUDIO"]}
+        async with client.aio.live.connect(
+            model="gemini-2.0-flash-exp",
+            config=config,  # type: ignore
+        ) as session:
+            self.session = session
+            while not self.quit.is_set():
+                turn = self.session.receive()
+                try:
+                    async for response in turn:
+                        if data := response.data:
+                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                            self.audio_queue.put_nowait(audio)
+                except websockets.exceptions.ConnectionClosedOK:
+                    print("connection closed")
                     break
 
+    async def video_receive(self, frame: np.ndarray):
+        self.video_queue.put_nowait(frame)
+
+        if self.session:
+            # send image every 1 second
+            print(time.time() - self.last_frame_time)
+            if time.time() - self.last_frame_time > 1:
+                self.last_frame_time = time.time()
+                await self.session.send(input=encode_image(frame))
+                if self.latest_args[1] is not None:
+                    await self.session.send(input=encode_image(self.latest_args[1]))
+
+    async def video_emit(self):
+        frame = await wait_for_item(self.video_queue, 0.01)
+        if frame is not None:
+            return frame
         else:
+            return np.zeros((100, 100, 3), dtype=np.uint8)
+
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        _, array = frame
+        array = array.squeeze()
+        audio_message = encode_audio(array)
+        if self.session:
+            await self.session.send(input=audio_message)
+
+    async def emit(self):
+        array = await wait_for_item(self.audio_queue, 0.01)
+        if array is not None:
+            return (self.output_sample_rate, array)
+        return array
+
+    async def shutdown(self) -> None:
+        if self.session:
+            self.quit.set()
+            await self.session.close()
+            self.quit.clear()
+
+
+stream = Stream(
+    handler=GeminiHandler(),
+    modality="audio-video",
+    mode="send-receive",
+    rtc_configuration=get_cloudflare_turn_credentials_async,
+    time_limit=180 if get_space() else None,
+    additional_inputs=[
+        gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
+    ],
+    ui_args={
+        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+        "pulse_color": "rgb(255, 255, 255)",
+        "icon_button_color": "rgb(255, 255, 255)",
+        "title": "Gemini Audio Video Chat",
+    },
+)
+
+css = """
+#video-source {max-width: 600px !important; max-height: 600 !important;}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
+    <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
+        <div style="background-color: var(--block-background-fill); border-radius: 8px">
+            <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
+        </div>
+        <div>
+            <h1>Gen AI SDK Voice Chat</h1>
+            <p>Speak with Gemini using real-time audio + video streaming</p>
+            <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
+            <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
+        </div>
+    </div>
+    """
+    )
+    with gr.Row() as row:
+        with gr.Column():
+            webrtc = WebRTC(
+                label="Video Chat",
+                modality="audio-video",
+                mode="send-receive",
+                elem_id="video-source",
+                rtc_configuration=get_cloudflare_turn_credentials_async,
+                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+                pulse_color="rgb(255, 255, 255)",
+                icon_button_color="rgb(255, 255, 255)",
            )
+        with gr.Column():
+            image_input = gr.Image(
+                label="Image", type="numpy", sources=["upload", "clipboard"]
            )
 
+    webrtc.stream(
+        GeminiHandler(),
+        inputs=[webrtc, image_input],
+        outputs=[webrtc],
+        time_limit=180 if get_space() else None,
+        concurrency_limit=2 if get_space() else None,
    )
 
+stream.ui = demo
 
 if __name__ == "__main__":
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        raise ValueError("Phone mode not supported for this demo")
     else:
+        stream.ui.launch(server_port=7860)
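
For orientation, the sketch below (not part of the committed app.py) illustrates the payload formats the new handler exchanges with the Gemini live session: encode_audio wraps an int16 PCM chunk as a base64 "audio/pcm" message, encode_image wraps a frame as base64 JPEG, and start_up decodes returned PCM bytes back into a (1, N) int16 array. The helper bodies are copied from the new app.py above; the numpy arrays and sizes are synthetic, chosen only to match the handler's 16 kHz input rate.

# Standalone sketch, assuming only numpy and Pillow are installed.
import base64
from io import BytesIO

import numpy as np
from PIL import Image


def encode_audio(data: np.ndarray) -> dict:
    # Same shape of message that GeminiHandler.receive() sends over the session
    return {
        "mime_type": "audio/pcm",
        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
    }


def encode_image(data: np.ndarray) -> dict:
    # Same JPEG-over-base64 message that video_receive() sends once per second
    with BytesIO() as output_bytes:
        Image.fromarray(data).save(output_bytes, "JPEG")
        bytes_data = output_bytes.getvalue()
    return {"mime_type": "image/jpeg", "data": str(base64.b64encode(bytes_data), "utf-8")}


# A one-second mono chunk at input_sample_rate=16000 (synthetic silence)
audio_chunk = np.zeros(16000, dtype=np.int16)
print(encode_audio(audio_chunk)["mime_type"])  # audio/pcm

# A synthetic RGB frame, as the WebRTC track would deliver to video_receive()
frame = np.zeros((480, 640, 3), dtype=np.uint8)
print(encode_image(frame)["mime_type"])  # image/jpeg

# Receive side: the same frombuffer/reshape step start_up() applies to returned audio
pcm_bytes = audio_chunk.tobytes()
decoded = np.frombuffer(pcm_bytes, dtype=np.int16).reshape(1, -1)
print(decoded.shape)  # (1, 16000)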