Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -62,65 +62,34 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
|
|
62 |
api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
|
63 |
)
|
64 |
config = {"response_modalities": ["AUDIO"]}
|
65 |
-
|
66 |
-
# --- System Message Definition ---
|
67 |
-
# You can customize this message to set the context for the AI.
|
68 |
-
system_message = (
|
69 |
-
"du bist ein echtzeitübersetzer für deutsch auf italienisch und italienisch auf deutsch. erläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen"
|
70 |
-
|
71 |
-
)
|
72 |
-
# --- End of System Message Definition ---
|
73 |
-
|
74 |
async with client.aio.live.connect(
|
75 |
model="gemini-2.0-flash-exp",
|
76 |
config=config, # type: ignore
|
77 |
) as session:
|
78 |
self.session = session
|
79 |
-
|
80 |
-
# --- Send the System Message ---
|
81 |
-
if system_message:
|
82 |
-
print(f"Sending system message to Gemini: '{system_message}'")
|
83 |
-
try:
|
84 |
-
# Send the system message as the first input to the model
|
85 |
-
await self.session.send(input=system_message)
|
86 |
-
# Note: The model might provide an audio response to this system message,
|
87 |
-
# which will be handled by the loop below.
|
88 |
-
# If this initial audio response is not desired, further handling might be needed,
|
89 |
-
# or the system message phrased to not elicit a direct spoken reply.
|
90 |
-
except Exception as e:
|
91 |
-
print(f"Error sending system message: {e}")
|
92 |
-
# --- End of Sending System Message ---
|
93 |
-
|
94 |
while not self.quit.is_set():
|
95 |
turn = self.session.receive()
|
96 |
try:
|
97 |
async for response in turn:
|
98 |
-
if data := response.data:
|
99 |
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
|
100 |
-
|
101 |
-
# If you expect other types of responses (e.g., text parts) from the model,
|
102 |
-
# you would need to handle `response.text` or other fields here.
|
103 |
except websockets.exceptions.ConnectionClosedOK:
|
104 |
-
print("
|
105 |
break
|
106 |
-
except Exception as e:
|
107 |
-
print(f"Error during session receive: {e}")
|
108 |
-
break # Or implement more robust error handling
|
109 |
|
110 |
async def video_receive(self, frame: np.ndarray):
|
111 |
self.video_queue.put_nowait(frame)
|
112 |
|
113 |
if self.session:
|
114 |
# send image every 1 second
|
115 |
-
|
116 |
if time.time() - self.last_frame_time > 1:
|
117 |
self.last_frame_time = time.time()
|
118 |
await self.session.send(input=encode_image(frame))
|
119 |
-
|
120 |
-
if len(self.latest_args) > 1 and self.latest_args[1] is not None:
|
121 |
await self.session.send(input=encode_image(self.latest_args[1]))
|
122 |
|
123 |
-
|
124 |
async def video_emit(self):
|
125 |
frame = await wait_for_item(self.video_queue, 0.01)
|
126 |
if frame is not None:
|
@@ -139,18 +108,13 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
|
|
139 |
array = await wait_for_item(self.audio_queue, 0.01)
|
140 |
if array is not None:
|
141 |
return (self.output_sample_rate, array)
|
142 |
-
return array
|
143 |
|
144 |
async def shutdown(self) -> None:
|
145 |
if self.session:
|
146 |
self.quit.set()
|
147 |
-
|
148 |
-
|
149 |
-
except Exception as e:
|
150 |
-
print(f"Error closing session: {e}")
|
151 |
-
finally:
|
152 |
-
self.session = None # Ensure session is cleared
|
153 |
-
self.quit.clear()
|
154 |
|
155 |
|
156 |
stream = Stream(
|
@@ -182,7 +146,7 @@ with gr.Blocks(css=css) as demo:
|
|
182 |
<h1>Gen AI Voice Chat</h1>
|
183 |
<p>real-time audio streaming</p>
|
184 |
</center>
|
185 |
-
</div>
|
186 |
"""
|
187 |
)
|
188 |
with gr.Row() as row:
|
@@ -197,19 +161,14 @@ with gr.Blocks(css=css) as demo:
|
|
197 |
pulse_color="rgb(255, 255, 255)",
|
198 |
icon_button_color="rgb(255, 255, 255)",
|
199 |
)
|
200 |
-
#
|
201 |
-
|
202 |
-
|
203 |
-
#
|
204 |
-
|
205 |
-
# )
|
206 |
-
|
207 |
-
# The stream method now correctly uses the handler instance from the Stream object
|
208 |
-
# and correctly wires up inputs and outputs.
|
209 |
-
# The additional_inputs (like an image) are passed via self.latest_args in the handler.
|
210 |
webrtc.stream(
|
211 |
-
|
212 |
-
inputs=[webrtc]
|
213 |
outputs=[webrtc],
|
214 |
time_limit=180 if get_space() else None,
|
215 |
concurrency_limit=2 if get_space() else None,
|
@@ -224,4 +183,4 @@ if __name__ == "__main__":
|
|
224 |
elif mode == "PHONE":
|
225 |
raise ValueError("Phone mode not supported for this demo")
|
226 |
else:
|
227 |
-
stream.ui.launch(server_port=7860)
|
|
|
62 |
api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
|
63 |
)
|
64 |
config = {"response_modalities": ["AUDIO"]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
async with client.aio.live.connect(
|
66 |
model="gemini-2.0-flash-exp",
|
67 |
config=config, # type: ignore
|
68 |
) as session:
|
69 |
self.session = session
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
while not self.quit.is_set():
|
71 |
turn = self.session.receive()
|
72 |
try:
|
73 |
async for response in turn:
|
74 |
+
if data := response.data:
|
75 |
audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
|
76 |
+
self.audio_queue.put_nowait(audio)
|
|
|
|
|
77 |
except websockets.exceptions.ConnectionClosedOK:
|
78 |
+
print("connection closed")
|
79 |
break
|
|
|
|
|
|
|
80 |
|
81 |
async def video_receive(self, frame: np.ndarray):
|
82 |
self.video_queue.put_nowait(frame)
|
83 |
|
84 |
if self.session:
|
85 |
# send image every 1 second
|
86 |
+
print(time.time() - self.last_frame_time)
|
87 |
if time.time() - self.last_frame_time > 1:
|
88 |
self.last_frame_time = time.time()
|
89 |
await self.session.send(input=encode_image(frame))
|
90 |
+
if self.latest_args[1] is not None:
|
|
|
91 |
await self.session.send(input=encode_image(self.latest_args[1]))
|
92 |
|
|
|
93 |
async def video_emit(self):
|
94 |
frame = await wait_for_item(self.video_queue, 0.01)
|
95 |
if frame is not None:
|
|
|
108 |
array = await wait_for_item(self.audio_queue, 0.01)
|
109 |
if array is not None:
|
110 |
return (self.output_sample_rate, array)
|
111 |
+
return array
|
112 |
|
113 |
async def shutdown(self) -> None:
|
114 |
if self.session:
|
115 |
self.quit.set()
|
116 |
+
await self.session.close()
|
117 |
+
self.quit.clear()
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
|
120 |
stream = Stream(
|
|
|
146 |
<h1>Gen AI Voice Chat</h1>
|
147 |
<p>real-time audio streaming</p>
|
148 |
</center>
|
149 |
+
</div>
|
150 |
"""
|
151 |
)
|
152 |
with gr.Row() as row:
|
|
|
161 |
pulse_color="rgb(255, 255, 255)",
|
162 |
icon_button_color="rgb(255, 255, 255)",
|
163 |
)
|
164 |
+
#with gr.Column():
|
165 |
+
#image_input = gr.Image(
|
166 |
+
#label="Image", type="numpy", sources=["upload", "clipboard"]
|
167 |
+
#)
|
168 |
+
|
|
|
|
|
|
|
|
|
|
|
169 |
webrtc.stream(
|
170 |
+
GeminiHandler(),
|
171 |
+
inputs=[webrtc],
|
172 |
outputs=[webrtc],
|
173 |
time_limit=180 if get_space() else None,
|
174 |
concurrency_limit=2 if get_space() else None,
|
|
|
183 |
elif mode == "PHONE":
|
184 |
raise ValueError("Phone mode not supported for this demo")
|
185 |
else:
|
186 |
+
stream.ui.launch(server_port=7860)
|