mgokg committed on
Commit bbf3c5f · verified · 1 Parent(s): 5982628

Update app.py

Files changed (1)
  1. app.py +151 -62
app.py CHANGED
@@ -3,82 +3,171 @@ import base64
 import os
 import time
 from io import BytesIO
-
+import gradio as gr
 import numpy as np
 import websockets
-import streamlit as st
 from dotenv import load_dotenv
-from PIL import Image
+from fastrtc import (
+    AsyncAudioVideoStreamHandler,
+    Stream,
+    WebRTC,
+    get_cloudflare_turn_credentials_async,
+    wait_for_item,
+)
 from google import genai
-
+from gradio.utils import get_space
+from PIL import Image
 load_dotenv()
-
-# Helper Functions
 def encode_audio(data: np.ndarray) -> dict:
-    """Encode Audio data"""
-    return {
-        "mime_type": "audio/pcm",
-        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
-    }
-
+    """Encode Audio data to send to the server"""
+    return {
+        "mime_type": "audio/pcm",
+        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
+    }
 def encode_image(data: np.ndarray) -> dict:
-    """Encode Image data"""
-    with BytesIO() as output_bytes:
-        pil_image = Image.fromarray(data)
-        pil_image.save(output_bytes, "JPEG")
-        bytes_data = output_bytes.getvalue()
-    return {"mime_type": "image/jpeg", "data": base64.b64encode(bytes_data).decode("utf-8")}
-
-# Streamlit UI
-st.title("Gen AI Voice Chat")
-st.subheader("Real-time audio & video streaming")
-
-# Initialize chat history
-if "messages" not in st.session_state:
-    st.session_state.messages = [
-        {"role": "assistant", "content": "Welcome! I'm your AI assistant. I can process images and audio in real-time. How can I help you today?"}
-    ]
+    with BytesIO() as output_bytes:
+        pil_image = Image.fromarray(data)
+        pil_image.save(output_bytes, "JPEG")
+        bytes_data = output_bytes.getvalue()
+    base64_str = str(base64.b64encode(bytes_data), "utf-8")
+    return {"mime_type": "image/jpeg", "data": base64_str}
+class GeminiHandler(AsyncAudioVideoStreamHandler):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__(
+            "mono",
+            output_sample_rate=24000,
+            input_sample_rate=16000,
+        )
+        self.audio_queue = asyncio.Queue()
+        self.video_queue = asyncio.Queue()
+        self.session = None
+        self.last_frame_time = 0
+        self.quit = asyncio.Event()
+    def copy(self) -> "GeminiHandler":
+        return GeminiHandler()

-# Display chat messages
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.write(message["content"])
-
-# Sidebar for image upload
-with st.sidebar:
-    st.header("Configuration")
-    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "png"])
-    if uploaded_image:
-        # Add user message with image
-        st.session_state.messages.append({"role": "user", "content": "Uploaded an image"})
-        # Display image in chat
-        with st.chat_message("user"):
-            st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
-
-# WebRTC Streaming Placeholder
-with st.expander("🎥 Live Video Stream"):
-    st.write("WebRTC video streaming placeholder - implement your video streaming here")
-
-# Async Audio Processing
-async def start_audio_processing():
-    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+    async def start_up(self):
+        client = genai.Client(
+            api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
+        )
+        config = {"response_modalities": ["AUDIO"]}
         async with client.aio.live.connect(
-        model="gemini-2.0-flash-exp", config={"response_modalities": ["AUDIO"]}
+            model="gemini-2.0-flash-exp",
+            config=config,  # type: ignore
         ) as session:
-        while True:
-            turn = session.receive()
+            self.session = session
+            while not self.quit.is_set():
+                turn = self.session.receive()
                 try:
                     async for response in turn:
                         if data := response.data:
                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
-                        # Add assistant response to chat
-                        st.session_state.messages.append({"role": "assistant", "content": audio})
-                        with st.chat_message("assistant"):
-                            st.audio(audio, format="audio/wav")
+                            self.audio_queue.put_nowait(audio)
                 except websockets.exceptions.ConnectionClosedOK:
-                st.error("Connection closed.")
+                    print("connection closed")
                     break

-# Run the Streamlit App
-if __name__ == "__main__":
-    asyncio.run(start_audio_processing())
+    async def video_receive(self, frame: np.ndarray):
+        self.video_queue.put_nowait(frame)
+
+        if self.session:
+            # send image every 1 second
+            print(time.time() - self.last_frame_time)
+            if time.time() - self.last_frame_time > 1:
+                self.last_frame_time = time.time()
+                await self.session.send(input=encode_image(frame))
+                if self.latest_args[1] is not None:
+                    await self.session.send(input=encode_image(self.latest_args[1]))
+
+    async def video_emit(self):
+        frame = await wait_for_item(self.video_queue, 0.01)
+        if frame is not None:
+            return frame
+        else:
+            return np.zeros((100, 100, 3), dtype=np.uint8)
+
+    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        _, array = frame
+        array = array.squeeze()
+        audio_message = encode_audio(array)
+        if self.session:
+            await self.session.send(input=audio_message)
+
+    async def emit(self):
+        array = await wait_for_item(self.audio_queue, 0.01)
+        if array is not None:
+            return (self.output_sample_rate, array)
+        return array
+
+    async def shutdown(self) -> None:
+        if self.session:
+            self.quit.set()
+            await self.session.close()
+            self.quit.clear()
+
+stream = Stream(
+    handler=GeminiHandler(),
+    modality="audio-video",
+    mode="send-receive",
+    rtc_configuration=get_cloudflare_turn_credentials_async,
+    time_limit=180 if get_space() else None,
+    additional_inputs=[
+        gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
+    ],
+    ui_args={
+        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+        "pulse_color": "rgb(255, 255, 255)",
+        "icon_button_color": "rgb(255, 255, 255)",
+        "title": "Gemini Audio Video Chat",
+    },
+)
+css = """
+#video-source {max-width: 600px !important; max-height: 600px !important;}
+"""
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
+        <div>
+        <center>
+        <h1>Gen AI Voice Chat</h1>
+        <p>Real-time audio + video streaming</p>
+        </center>
+        </div>
+        """
+    )
+    with gr.Row() as row:
+        with gr.Column():
+            webrtc = WebRTC(
+                label="Video Chat",
+                modality="audio-video",
+                mode="send-receive",
+                elem_id="video-source",
+                rtc_configuration=get_cloudflare_turn_credentials_async,
+                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+                pulse_color="rgb(255, 255, 255)",
+                icon_button_color="rgb(255, 255, 255)",
+            )
+        # with gr.Column():
+        #     image_input = gr.Image(
+        #         label="Image", type="numpy", sources=["upload", "clipboard"]
+        #     )
+    webrtc.stream(
+        GeminiHandler(),
+        inputs=[webrtc],
+        # inputs=[webrtc, image_input],
+        outputs=[webrtc],
+        time_limit=180 if get_space() else None,
+        concurrency_limit=2 if get_space() else None,
+    )
+
+stream.ui = demo
+if __name__ == "__main__":
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        raise ValueError("Phone mode not supported for this demo")
+    else:
+        stream.ui.launch(server_port=7860)
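Note: the payload helpers can be sanity-checked offline. The following sketch is illustrative only (not part of the commit); it assumes one second of 16 kHz mono int16 PCM, matching the handler's input_sample_rate=16000:

import base64
import numpy as np

# One second of 16 kHz mono int16 PCM -- the shape GeminiHandler.receive() forwards.
samples = (32767 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)).astype(np.int16)

payload = {
    "mime_type": "audio/pcm",
    "data": base64.b64encode(samples.tobytes()).decode("UTF-8"),
}

# Decoding the base64 PCM recovers the original samples bit-for-bit.
decoded = np.frombuffer(base64.b64decode(payload["data"]), dtype=np.int16)
assert np.array_equal(decoded, samples)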
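Note: video_receive still indexes self.latest_args[1] even though the image_input column is commented out of webrtc.stream(...), so that lookup can fail at runtime. A defensive guard might look like this (a sketch only; it assumes fastrtc populates latest_args from the inputs list):

# Hypothetical guard inside video_receive, before touching the optional image input:
args = getattr(self, "latest_args", None)
if args is not None and len(args) > 1 and args[1] is not None:
    await self.session.send(input=encode_image(args[1]))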