SalexAI committed on
Commit
56d02e9
·
verified ·
1 Parent(s): 49384b3

Upload 2 files

Browse files
Files changed (2) hide show
  1. index.html +361 -0
  2. main.py +122 -0
index.html ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+
4
+ <head>
5
+ <link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
6
+ <link rel="stylesheet" href="https://code.getmdl.io/1.3.0/material.indigo-pink.min.css">
7
+ <script defer src="https://code.getmdl.io/1.3.0/material.min.js"></script>
8
+
9
+ <style>
10
+ #videoElement {
11
+ width: 640px;
12
+ height: 480px;
13
+ border-radius: 20px;
14
+ }
15
+
16
+ #canvasElement {
17
+ display: none;
18
+ width: 640px;
19
+ height: 480px;
20
+ }
21
+
22
+ .demo-content {
23
+ padding: 20px;
24
+ display: flex;
25
+ flex-direction: column;
26
+ align-items: center;
27
+ }
28
+
29
+ .button-group {
30
+ margin-bottom: 20px;
31
+ }
32
+ </style>
33
+ </head>
34
+
35
+ <body>
36
<!--
  Page layout: header bar, microphone start/stop buttons, the live preview of
  the shared screen, the hidden canvas used to grab JPEG frames, and the chat log.
  Element ids are read by the inline script below and must stay unchanged.
-->
<div class="mdl-layout mdl-js-layout mdl-layout--fixed-header">
  <header class="mdl-layout__header">
    <div class="mdl-layout__header-row">
      <!-- Title -->
      <span class="mdl-layout-title">Gemini Live Demo</span>
    </div>
  </header>
  <main class="mdl-layout__content">
    <div class="page-content">
      <div class="demo-content">
        <!-- Voice control buttons: icon-only, so each needs an explicit accessible name -->
        <div class="button-group">
          <button id="startButton" type="button" aria-label="Start microphone" title="Start microphone"
            class="mdl-button mdl-js-button mdl-button--fab mdl-button--mini-fab mdl-button--colored">
            <i class="material-icons" aria-hidden="true">mic</i>
          </button>
          <button id="stopButton" type="button" aria-label="Stop microphone" title="Stop microphone"
            class="mdl-button mdl-js-button mdl-button--fab mdl-button--mini-fab">
            <i class="material-icons" aria-hidden="true">mic_off</i>
          </button>
        </div>

        <!-- Screen-share preview; sized by the #videoElement rule in the stylesheet -->
        <video id="videoElement" autoplay muted playsinline></video>

        <!-- Hidden canvas used for frame capture; sized by the #canvasElement rule -->
        <canvas id="canvasElement"></canvas>
        <!-- Text output; aria-live lets screen readers announce new replies -->
        <div id="chatLog" aria-live="polite"></div>
      </div>
    </div>
  </main>
</div>
69
+
70
+ <script defer>
71
// Address of the local Python relay that forwards traffic to Gemini.
const URL = "ws://localhost:9083";

// DOM handles used throughout the script.
const video = document.getElementById("videoElement");
const canvas = document.getElementById("canvasElement");

// 2D drawing context of the hidden canvas; assigned once the page has loaded.
let context;

// On load: grab the canvas context and start sampling a frame every 3 seconds.
window.addEventListener("load", function onLoadStartFrameCapture() {
  context = canvas.getContext("2d");
  setInterval(captureImage, 3000);
});

const startButton = document.getElementById('startButton');
const stopButton = document.getElementById('stopButton');

// Screen-share state.
let stream = null;      // MediaStream returned by getDisplayMedia
let currentFrameB64;    // latest captured JPEG frame, base64 without the data-URL prefix

// Connection to the relay server.
let webSocket = null;

// Microphone capture state.
let audioContext = null;
let mediaRecorder = null;
let processor = null;
let pcmData = [];       // 16-bit samples accumulated between sends
let interval = null;    // timer id of the periodic recordChunk call

// Audio playback state.
let initialized = false;
let audioInputContext;
let workletNode;
95
+
96
+
97
/**
 * Asks the browser for a screen-capture stream (at most 640x480), attaches it
 * to the preview <video>, and resolves once the video's metadata has loaded.
 * Failures are logged to the console and not rethrown.
 */
async function startScreenShare() {
  const displayConstraints = {
    video: {
      width: { max: 640 },
      height: { max: 480 },
    },
  };

  try {
    stream = await navigator.mediaDevices.getDisplayMedia(displayConstraints);
    video.srcObject = stream;

    // Wait for the video dimensions to become known before returning.
    const metadataLoaded = new Promise((resolve) => {
      video.onloadedmetadata = () => {
        console.log("video loaded metadata");
        resolve();
      };
    });
    await metadataLoaded;
  } catch (err) {
    console.error("Error accessing the screen: ", err);
  }
}
119
+
120
+
121
/**
 * Draws the current video frame onto the hidden 640x480 canvas and stores it
 * in `currentFrameB64` as base64 JPEG data (data-URL prefix removed).
 * Logs a message and does nothing else when no frame is available yet.
 */
function captureImage() {
  const frameAvailable =
    stream && video.videoWidth > 0 && video.videoHeight > 0 && context;

  if (!frameAvailable) {
    console.log("no stream or video metadata not loaded");
    return;
  }

  canvas.width = 640;
  canvas.height = 480;
  context.drawImage(video, 0, 0, canvas.width, canvas.height);

  // A data URL looks like "data:image/jpeg;base64,<payload>"; keep only the payload.
  const dataUrlParts = canvas.toDataURL("image/jpeg").split(",");
  currentFrameB64 = dataUrlParts[1].trim();
}
134
+
135
+
136
+
137
// Page start-up sequence: screen share first, then the playback audio context,
// and finally the websocket connection to the relay.
window.addEventListener("load", async function onLoadStartSession() {
  await startScreenShare();
  await initializeAudioContext();
  connect();
});
146
+
147
/**
 * Opens the websocket to the relay server and installs its event handlers.
 * The setup message is sent as soon as the socket opens; incoming messages
 * are routed to `receiveMessage`.
 */
function connect() {
  console.log("connecting: ", URL);

  webSocket = new WebSocket(URL);

  Object.assign(webSocket, {
    onclose(event) {
      console.log("websocket closed: ", event);
      alert("Connection closed");
    },
    onerror(event) {
      console.log("websocket error: ", event);
    },
    onopen(event) {
      console.log("websocket open: ", event);
      sendInitialSetupMessage();
    },
    onmessage: receiveMessage,
  });
}
168
+
169
/**
 * Sends the session setup message over the websocket, asking for AUDIO
 * responses. Must be called once the socket is open.
 */
function sendInitialSetupMessage() {
  console.log("sending setup message");

  // Declared with const: the previous bare assignment created an implicit
  // global and would throw a ReferenceError in strict-mode or module code.
  const setupClientMessage = {
    setup: {
      generation_config: { response_modalities: ["AUDIO"] },
    },
  };

  webSocket.send(JSON.stringify(setupClientMessage));
}
180
+
181
+
182
/**
 * Sends one chunk of microphone audio, plus the most recent screen frame when
 * one has been captured, to the relay server as a `realtime_input` message.
 *
 * @param {string} b64PCM - base64-encoded 16-bit PCM audio.
 */
function sendVoiceMessage(b64PCM) {
  if (webSocket == null) {
    console.log("websocket not initialized");
    return;
  }

  // WebSocket.OPEN; send() throws while CONNECTING and silently drops data
  // once the socket is closing or closed.
  const SOCKET_OPEN = 1;
  if (webSocket.readyState !== SOCKET_OPEN) {
    console.log("websocket not open, dropping audio chunk");
    return;
  }

  const mediaChunks = [
    {
      mime_type: "audio/pcm",
      data: b64PCM,
    },
  ];

  // Only attach a frame once one exists: an undefined `data` is dropped by
  // JSON.stringify, leaving the server with an image chunk that has no data.
  if (currentFrameB64) {
    mediaChunks.push({
      mime_type: "image/jpeg",
      data: currentFrameB64,
    });
  }

  const payload = {
    realtime_input: {
      media_chunks: mediaChunks,
    },
  };

  webSocket.send(JSON.stringify(payload));
  console.log("sent: ", payload);
}
205
+
206
/**
 * Websocket `message` handler: parses the JSON payload from the relay and
 * shows any text in the chat log and/or queues any audio for playback.
 * Malformed or non-object payloads are logged and ignored.
 *
 * @param {MessageEvent} event - websocket message whose `data` is a JSON string.
 */
function receiveMessage(event) {
  let messageData;
  try {
    messageData = JSON.parse(event.data);
  } catch (error) {
    console.error("Ignoring malformed websocket message:", error);
    return;
  }

  // JSON such as `null` or `42` parses fine but carries no fields to read.
  if (messageData === null || typeof messageData !== "object") {
    console.error("Ignoring unexpected websocket message:", messageData);
    return;
  }

  const response = new Response(messageData);

  if (response.text) {
    displayMessage("GEMINI: " + response.text);
  }
  if (response.audioData) {
    injestAudioChuckToPlay(response.audioData);
  }
}
217
+
218
+
219
/**
 * One-time setup of the playback pipeline: creates a 24 kHz AudioContext,
 * loads the "pcm-processor" worklet module and connects a worklet node to the
 * context's destination. Later calls return immediately.
 */
async function initializeAudioContext() {
  if (initialized) {
    return;
  }

  // Fall back to the prefixed constructor where the standard one is missing.
  const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
  audioInputContext = new AudioContextCtor({ sampleRate: 24000 });

  await audioInputContext.audioWorklet.addModule("pcm-processor.js");

  workletNode = new AudioWorkletNode(audioInputContext, "pcm-processor");
  workletNode.connect(audioInputContext.destination);

  initialized = true;
}
231
+
232
+
233
/**
 * Decodes a base64 string into raw bytes.
 *
 * Uses the global `atob` rather than `window.atob`, so the helper also works
 * where there is no `window` object (workers, Node).
 *
 * @param {string} base64 - base64-encoded data.
 * @returns {ArrayBuffer} the decoded bytes.
 */
function base64ToArrayBuffer(base64) {
  const binaryString = atob(base64);
  // Every character of an atob() result is a single code unit in 0..255.
  return Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0)).buffer;
}
241
+
242
/**
 * Converts signed 16-bit little-endian PCM into Float32 samples in [-1, 1).
 *
 * For an ArrayBuffer the bytes are read explicitly as little-endian and a
 * trailing odd byte is ignored (constructing an Int16Array over an odd-length
 * buffer throws a RangeError). Any other input is handed to the Int16Array
 * constructor exactly as before.
 *
 * @param {ArrayBuffer} pcmData - raw PCM16LE bytes.
 * @returns {Float32Array} one float per 16-bit sample.
 */
function convertPCM16LEToFloat32(pcmData) {
  if (!(pcmData instanceof ArrayBuffer)) {
    return Float32Array.from(new Int16Array(pcmData), (sample) => sample / 32768);
  }

  const sampleCount = Math.floor(pcmData.byteLength / 2);
  const view = new DataView(pcmData);
  const float32Array = new Float32Array(sampleCount);

  for (let i = 0; i < sampleCount; i++) {
    float32Array[i] = view.getInt16(i * 2, true) / 32768;
  }

  return float32Array;
}
252
+
253
+
254
/**
 * Decodes one base64 PCM16 chunk received from the server and posts the
 * resulting Float32 samples to the playback worklet. Resumes the audio
 * context first if it is suspended. Errors are logged, not rethrown.
 *
 * @param {string} base64AudioChunk - base64-encoded 16-bit PCM audio.
 */
async function injestAudioChuckToPlay(base64AudioChunk) {
  try {
    const isSuspended = audioInputContext.state === "suspended";
    if (isSuspended) {
      await audioInputContext.resume();
    }

    const samples = convertPCM16LEToFloat32(
      base64ToArrayBuffer(base64AudioChunk)
    );
    workletNode.port.postMessage(samples);
  } catch (error) {
    console.error("Error processing audio chunk:", error);
  }
}
267
+
268
+
269
/**
 * Packs the samples accumulated in `pcmData` as 16-bit little-endian PCM,
 * base64-encodes them, hands the result to `sendVoiceMessage`, and leaves
 * `pcmData` empty. Does nothing when no samples have been collected.
 */
function recordChunk() {
  if (pcmData.length === 0) {
    return;
  }

  // Take ownership of the pending samples up front so audio arriving during
  // the encode/send below starts a fresh buffer.
  const samples = pcmData;
  pcmData = [];

  const bytes = new Uint8Array(samples.length * 2);
  const view = new DataView(bytes.buffer);
  samples.forEach((value, index) => {
    view.setInt16(index * 2, value, true);
  });

  // Build the binary string in slices: passing ~100k arguments to a single
  // String.fromCharCode.apply call can exceed an engine's argument limit.
  const SLICE_SIZE = 0x8000;
  let binary = "";
  for (let offset = 0; offset < bytes.length; offset += SLICE_SIZE) {
    binary += String.fromCharCode.apply(
      null,
      bytes.subarray(offset, offset + SLICE_SIZE)
    );
  }

  sendVoiceMessage(btoa(binary));
}
283
+
284
// Microphone MediaStream currently being captured, kept so that
// stopAudioInput can release the device.
let micStream = null;

/**
 * Starts microphone capture at 16 kHz mono. Samples are converted to 16-bit
 * integers, accumulated in `pcmData`, and flushed by `recordChunk` every
 * 3 seconds. Calling it again while capture is already running is a no-op.
 * A failure to obtain the microphone is logged and capture is torn down.
 */
async function startAudioInput() {
  if (audioContext) {
    // Already capturing: a second context and timer would leak the first pair.
    return;
  }

  const captureContext = new AudioContext({
    sampleRate: 16000,
  });
  audioContext = captureContext;

  let capturedStream;
  try {
    capturedStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        sampleRate: 16000,
      },
    });
  } catch (err) {
    console.error("Error accessing the microphone: ", err);
    if (audioContext === captureContext) {
      stopAudioInput();
    }
    return;
  }

  if (audioContext !== captureContext) {
    // Capture was stopped while the permission prompt was open.
    capturedStream.getTracks().forEach((track) => track.stop());
    return;
  }
  micStream = capturedStream;

  const source = audioContext.createMediaStreamSource(micStream);
  processor = audioContext.createScriptProcessor(4096, 1, 1);

  processor.onaudioprocess = (e) => {
    const inputData = e.inputBuffer.getChannelData(0);
    const pcm16 = new Int16Array(inputData.length);
    for (let i = 0; i < inputData.length; i++) {
      // Clamp first: a sample outside [-1, 1] would otherwise wrap around
      // when stored into the Int16Array.
      const sample = Math.max(-1, Math.min(1, inputData[i]));
      pcm16[i] = sample * 0x7fff;
    }
    pcmData.push(...pcm16);
  };

  source.connect(processor);
  processor.connect(audioContext.destination);

  interval = setInterval(recordChunk, 3000);
}

/**
 * Stops microphone capture: cancels the periodic flush, disconnects the
 * processor, releases the microphone, closes the capture AudioContext, and
 * sends any samples still buffered. Safe to call when capture is not running.
 */
function stopAudioInput() {
  clearInterval(interval);
  interval = null;

  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
    processor = null;
  }

  if (micStream) {
    // Stopping the tracks is what actually releases the microphone.
    micStream.getTracks().forEach((track) => track.stop());
    micStream = null;
  }

  if (audioContext) {
    audioContext.close().catch((err) => {
      console.error("Error closing audio context: ", err);
    });
    // Cleared so a repeated stop does not call close() on a closed context.
    audioContext = null;
  }

  // Send the tail of the recording instead of leaving it to be prepended to
  // the next capture session.
  if (pcmData.length > 0) {
    recordChunk();
  }
}
324
+
325
/**
 * Logs a message to the console and appends it to the on-page chat log.
 *
 * @param {string} message - text to show.
 */
function displayMessage(message) {
  const chatLogId = "chatLog";
  console.log(message);
  addParagraphToDiv(chatLogId, message);
}
329
+
330
+
331
/**
 * Appends a new <p> containing `text` to the element with id `divId`.
 * The text is assigned via textContent, so it is never parsed as HTML.
 *
 * @param {string} divId - id of the container element.
 * @param {string} text - paragraph text.
 */
function addParagraphToDiv(divId, text) {
  const paragraph = Object.assign(document.createElement("p"), {
    textContent: text,
  });
  document.getElementById(divId).appendChild(paragraph);
}
337
+
338
// Wire the microphone buttons to the capture controls.
for (const [button, onClick] of [
  [startButton, startAudioInput],
  [stopButton, stopAudioInput],
]) {
  button.addEventListener('click', onClick);
}
340
+
341
+
342
/**
 * Normalized view of one message from the relay server.
 *
 * Fields:
 *   text      - text from the message, or null when absent/empty.
 *   audioData - the message's `audio` value (base64 audio), or null when absent/empty.
 *   endOfTurn - always null; nothing in this file populates it.
 *
 * Note: this class shadows the browser's built-in Fetch `Response` within
 * this script.
 */
class Response {
  /**
   * @param {?Object} data - parsed JSON message; null/undefined yield an empty response.
   */
  constructor(data) {
    // Tolerate a missing payload instead of throwing on property access.
    const fields = data || {};

    this.text = fields.text || null;
    this.audioData = fields.audio || null;
    this.endOfTurn = null;
  }
}
357
+ </script>
358
+
359
+ </body>
360
+
361
+ </html>
main.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## pip install --upgrade google-genai==0.3.0##
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import websockets
6
+ from google import genai
7
+ import base64
8
+
9
# Load the API key from the environment. setdefault keeps a key that is already
# exported; the previous unconditional assignment replaced it with an empty
# string. The empty value remains only as a placeholder when nothing is set.
os.environ.setdefault('GOOGLE_API_KEY', '')
MODEL = "gemini-2.0-flash-exp"  # use your model ID

# Gemini client pinned to the v1alpha API version.
client = genai.Client(
    http_options={
        'api_version': 'v1alpha',
    }
)
18
+
19
async def gemini_session_handler(client_websocket: websockets.WebSocketServerProtocol):
    """Handles the interaction with Gemini API within a websocket session.

    The first client message must be a JSON object whose optional "setup" entry
    is used as the Gemini session config (a system instruction is added to it).
    After that, two tasks run concurrently: one forwards the client's audio and
    image chunks to Gemini, the other forwards Gemini's text and audio parts
    back to the client. When either task finishes the other is cancelled, so
    the Gemini session is released as soon as the client disconnects.

    Args:
        client_websocket: The websocket connection to the client.
    """
    try:
        config_message = await client_websocket.recv()
        config_data = json.loads(config_message)
        config = config_data.get("setup", {})
        config["system_instruction"] = (
            "You are a helpful assistant for screen sharing sessions. Your role is to:\n"
            "1) Analyze and describe the content being shared on screen\n"
            "2) Answer questions about the shared content\n"
            "3) Provide relevant information and context about what's being shown\n"
            "4) Assist with technical issues related to screen sharing\n"
            "5) Maintain a professional and helpful tone. Focus on being concise and clear in your responses."
        )

        async with client.aio.live.connect(model=MODEL, config=config) as session:
            print("Connected to Gemini API")

            async def send_to_gemini():
                """Sends messages from the client websocket to the Gemini API."""
                try:
                    async for message in client_websocket:
                        try:
                            data = json.loads(message)
                            if "realtime_input" in data:
                                for chunk in data["realtime_input"]["media_chunks"]:
                                    # Only audio and JPEG frames are forwarded.
                                    if chunk["mime_type"] in ("audio/pcm", "image/jpeg"):
                                        await session.send({
                                            "mime_type": chunk["mime_type"],
                                            "data": chunk["data"],
                                        })
                        except Exception as e:
                            # One bad message must not end the forwarding loop.
                            print(f"Error sending to Gemini: {e}")
                    print("Client connection closed (send)")
                except Exception as e:
                    print(f"Error sending to Gemini: {e}")
                finally:
                    print("send_to_gemini closed")

            async def receive_from_gemini():
                """Receives responses from the Gemini API and forwards them to the client, looping until turn is complete."""
                try:
                    while True:
                        try:
                            print("receiving from gemini")
                            async for response in session.receive():
                                if response.server_content is None:
                                    print(f'Unhandled server message! - {response}')
                                    continue

                                model_turn = response.server_content.model_turn
                                if model_turn:
                                    for part in model_turn.parts:
                                        if hasattr(part, 'text') and part.text is not None:
                                            await client_websocket.send(json.dumps({"text": part.text}))
                                        elif hasattr(part, 'inline_data') and part.inline_data is not None:
                                            print("audio mime_type:", part.inline_data.mime_type)
                                            base64_audio = base64.b64encode(part.inline_data.data).decode('utf-8')
                                            await client_websocket.send(json.dumps({
                                                "audio": base64_audio,
                                            }))
                                            print("audio received")

                                if response.server_content.turn_complete:
                                    print('\n<Turn complete>')
                        except websockets.exceptions.ConnectionClosedOK:
                            print("Client connection closed normally (receive)")
                            break  # Exit the loop if the connection is closed
                        except Exception as e:
                            print(f"Error receiving from Gemini: {e}")
                            break

                except Exception as e:
                    print(f"Error receiving from Gemini: {e}")
                finally:
                    print("Gemini connection closed (receive)")

            # Run both directions concurrently.
            send_task = asyncio.create_task(send_to_gemini())
            receive_task = asyncio.create_task(receive_from_gemini())
            tasks = [send_task, receive_task]
            try:
                # Stop as soon as either direction ends: once the client is
                # gone (or Gemini failed) the other loop has nothing to do.
                await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
            finally:
                for task in tasks:
                    if not task.done():
                        task.cancel()
                # Let the cancelled task unwind before the session is closed.
                await asyncio.gather(*tasks, return_exceptions=True)

    except Exception as e:
        print(f"Error in Gemini session: {e}")
    finally:
        print("Gemini session closed.")
113
+
114
+
115
async def main() -> None:
    """Serve `gemini_session_handler` on localhost:9083 until the process is stopped."""
    server = websockets.serve(gemini_session_handler, "localhost", 9083)
    async with server:
        print("Running websocket server localhost:9083...")
        # A future that is never resolved keeps the server running indefinitely.
        never_done = asyncio.Future()
        await never_done


if __name__ == "__main__":
    asyncio.run(main())