freddyaboulton HF staff committed on
Commit
1be5829
·
verified ·
1 Parent(s): 3cfdbd1

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +9 -6
  2. app.py +232 -0
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: Gemini Conversation
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.19.0
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Gemini Talking to Gemini
3
+ emoji: ♊️
4
+ colorFrom: purple
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.17.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: Have two Gemini agents talk to each other
12
+ tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import os
4
+ from pathlib import Path
5
+ from typing import AsyncGenerator
6
+
7
+ import librosa
8
+ import numpy as np
9
+ from dotenv import load_dotenv
10
+ from fastrtc import (
11
+ AsyncStreamHandler,
12
+ Stream,
13
+ get_tts_model,
14
+ wait_for_item,
15
+ )
16
+ from fastrtc.utils import audio_to_int16
17
+ from google import genai
18
+ from google.genai.types import (
19
+ Content,
20
+ LiveConnectConfig,
21
+ Part,
22
+ PrebuiltVoiceConfig,
23
+ SpeechConfig,
24
+ VoiceConfig,
25
+ )
26
+
27
# Load environment variables from a local .env file (the run instructions
# below put GEMINI_API_KEY there).
load_dotenv()

# Directory that contains this script.
cur_dir = Path(__file__).parent

# Audio sample rate in Hz; the same 24000 Hz the stream handlers are
# configured with for both input and output.
SAMPLE_RATE = 24000

# Text-to-speech model, created once at import time. The student agent uses it
# to synthesize its opening line.
tts_model = get_tts_model()
34
+
35
+
36
class GeminiHandler(AsyncStreamHandler):
    """Relay audio between a FastRTC stream and a Gemini Live session.

    Incoming frames are base64-encoded and queued in ``input_queue``; the
    Gemini session consumes them through :meth:`stream`. Audio returned by
    Gemini is decoded to int16 samples and queued in ``output_queue`` for
    :meth:`emit`.

    Subclasses customise the agent by overriding the class attributes
    ``voice_name`` and ``system_prompts``.
    """

    # Name of the prebuilt Gemini voice used for this agent's replies.
    voice_name: str = "Charon"
    # Each entry becomes one ``Part`` of the session's system instruction.
    system_prompts: tuple[str, ...] = ("You are a helpful assistant.",)

    def __init__(
        self,
    ) -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=SAMPLE_RATE,
            output_frame_size=480,
            input_sample_rate=SAMPLE_RATE,
        )
        # Base64 text chunks waiting to be sent to Gemini.
        self.input_queue: asyncio.Queue = asyncio.Queue()
        # (sample_rate, int16 samples) tuples waiting to be played back.
        self.output_queue: asyncio.Queue = asyncio.Queue()
        # Set by shutdown() to end the stream() generator.
        self.quit: asyncio.Event = asyncio.Event()

    def copy(self) -> "GeminiHandler":
        """Return a fresh handler of the same concrete class."""
        # type(self) keeps subclasses working without their own override.
        return type(self)()

    def _build_config(self) -> LiveConnectConfig:
        """Build the Live API config from ``voice_name`` and ``system_prompts``."""
        return LiveConnectConfig(
            response_modalities=["AUDIO"],  # type: ignore
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=self.voice_name,
                    )
                )
            ),
            system_instruction=Content(
                parts=[Part(text=prompt) for prompt in self.system_prompts],
                role="system",
            ),
        )

    async def _run_session(self) -> None:
        """Open a Gemini Live session and pump its audio into ``output_queue``.

        Returns once the session's response stream is exhausted.
        """
        client = genai.Client(
            api_key=os.getenv("GEMINI_API_KEY"),
            http_options={"api_version": "v1alpha"},
        )
        async with client.aio.live.connect(
            model="gemini-2.0-flash-exp", config=self._build_config()
        ) as session:
            async for audio in session.start_stream(
                stream=self.stream(), mime_type="audio/pcm"
            ):
                # Responses without an audio payload are skipped.
                if audio.data:
                    array = np.frombuffer(audio.data, dtype=np.int16)
                    self.output_queue.put_nowait((self.output_sample_rate, array))

    async def start_up(self):
        """Run the Gemini session for this connection."""
        await self._run_session()

    async def stream(self) -> AsyncGenerator[str, None]:
        """Yield queued input chunks until :meth:`shutdown` is called.

        The items are the base64 strings that :meth:`receive` enqueues.
        """
        while not self.quit.is_set():
            try:
                # Short timeout so the quit flag is re-checked regularly
                # instead of waiting on an empty queue forever.
                audio = await asyncio.wait_for(self.input_queue.get(), 0.1)
                yield audio
            except (asyncio.TimeoutError, TimeoutError):
                pass

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Queue one incoming ``(sample_rate, samples)`` frame for Gemini.

        The sample rate is ignored; the samples are flattened with
        ``squeeze()`` and stored as base64 text.
        """
        _, array = frame
        array = array.squeeze()
        audio_message = base64.b64encode(array.tobytes()).decode("UTF-8")
        self.input_queue.put_nowait(audio_message)

    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Return the next queued output frame via ``wait_for_item``."""
        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:
        """Signal :meth:`stream` to stop feeding audio to Gemini."""
        self.quit.set()


class GeminiHandler2(GeminiHandler):
    """Student agent: speaks a synthesized opening line, then talks via Gemini."""

    voice_name = "Puck"
    system_prompts = (
        "You are a cooking student who wants to learn how to make an omelette.",
        "You are currently in the kitchen with a teacher who is helping you make an omelette.",
        "Please wait for the teacher to tell you what to do next. Follow the teacher's instructions carefully.",
    )
    # Sentence spoken before the Gemini session starts, to kick off the dialogue.
    opening_line = "Can you help me make an omelette?"

    def _synthesize_opening(self) -> np.ndarray:
        """Synthesize ``opening_line`` as int16 audio at the output sample rate."""
        orig_rate, samples = tts_model.tts(self.opening_line)
        resampled = librosa.resample(
            samples,
            orig_sr=orig_rate,
            target_sr=self.output_sample_rate,
        )
        return audio_to_int16((self.output_sample_rate, resampled))

    async def start_up(self):
        """Queue the opening line, then run the Gemini session."""
        # TTS and resampling are synchronous; run them in a worker thread so
        # the event loop is not blocked while they compute.
        opening = await asyncio.to_thread(self._synthesize_opening)
        await self.output_queue.put((self.output_sample_rate, opening))
        await self._run_session()
159
+
160
+
161
# Teacher agent: audio in, audio out, served by GeminiHandler.
gemini_stream = Stream(
    GeminiHandler(),
    mode="send-receive",
    modality="audio",
    ui_args={
        "title": "Gemini Teacher",
        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
        "pulse_color": "rgb(74, 138, 213)",
        "icon_button_color": "rgb(255, 255, 255)",
    },
)

# Student agent: same setup, served by GeminiHandler2 with its own title and
# pulse color.
gemini_stream_2 = Stream(
    GeminiHandler2(),
    mode="send-receive",
    modality="audio",
    ui_args={
        "title": "Gemini Student",
        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
        "pulse_color": "rgb(132, 112, 196)",
        "icon_button_color": "rgb(255, 255, 255)",
    },
)
184
+
185
if __name__ == "__main__":
    import time

    import gradio as gr
    from gradio.utils import get_space

    if get_space():
        # NOTE(review): the original tested `not get_space()`, which showed
        # this "clone the repo" page on local runs and blocked the agents
        # behind demo.launch(). The instructions are addressed to visitors of
        # the hosted Space, so they are shown there instead - confirm intent.
        with gr.Blocks() as demo:
            gr.HTML(
                """
                <div style="display: flex; justify-content: center; align-items: center;">
                    <h1>Gemini Conversation</h1>
                </div>
                """
            )
            gr.Markdown(
                """# How to run this demo

                - Clone the repo - top right of the page click the vertical three dots and select "Clone repository"
                - Open the repo in a terminal and install the dependencies
                - Get a gemini API key [here](https://ai.google.dev/gemini-api/docs/api-key)
                - Create a `.env` file in the root of the repo and add the following:
                  ```
                  GEMINI_API_KEY=<your_gemini_api_key>
                  ```
                - Run the app with `python app.py`
                - This will print the two URLs of the agents running locally
                - Use ngrok to expose one agent to the internet. This is so that you can access it from your phone
                - Use the ngrok URL to access the agent from your phone
                - Now, start the "teacher gemini" agent first. Then, start the "student gemini" agent. The student gemini will start talking to the teacher gemini. And the teacher gemini will respond!

                Important:
                - Make sure the audio sources are not too close to each other or too loud. Sometimes that causes them to talk over each other.
                - Feel free to modify the `system_instruction` to change the behavior of the agents.
                - You can also modify the `voice_name` to change the voice of the agents.
                - Have fun!
                """
            )
        demo.launch()
    else:
        # Serve the two agents on separate ports without blocking, then keep
        # the main thread alive until the user interrupts.
        _ = gemini_stream.ui.launch(server_port=7860, prevent_thread_lock=True)
        _ = gemini_stream_2.ui.launch(server_port=7861, prevent_thread_lock=True)
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            gemini_stream.ui.close()
            gemini_stream_2.ui.close()