nisten committed · Commit 277ac94 · verified · 1 Parent(s): 86d4810

Update src/worker.js

Files changed (1)
  1. src/worker.js +102 -176
src/worker.js CHANGED
@@ -26,24 +26,11 @@ import {
   MIN_SPEECH_DURATION_SAMPLES,
 } from "./constants";
 
-// WebGPU availability check - fail fast
-if (!navigator.gpu) {
-  self.postMessage({
-    type: "error",
-    error: new Error("WebGPU not supported. This app requires Chrome 113+, Edge 113+, or Chrome Canary with WebGPU enabled.")
-  });
-  throw new Error("WebGPU not available");
-}
-
-// TTS Configuration
 const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 let voice;
 const tts = await KokoroTTS.from_pretrained(model_id, {
-  dtype: "fp16", // Keep fp16 for memory efficiency
+  dtype: "fp16",
   device: "webgpu",
-}).catch((error) => {
-  self.postMessage({ error: new Error(`TTS loading failed: ${error.message}`) });
-  throw error;
 });
 
 const device = "webgpu";
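
For reference, the kokoro-js pieces this worker leans on (KokoroTTS.from_pretrained, TextSplitterStream, tts.stream) can be exercised standalone roughly as follows. This is a minimal sketch; the voice id "af_heart" is an illustrative assumption rather than something taken from this commit, and in the worker the voice is whatever the UI picks from tts.voices.

import { KokoroTTS, TextSplitterStream } from "kokoro-js";

const tts = await KokoroTTS.from_pretrained("onnx-community/Kokoro-82M-v1.0-ONNX", {
  dtype: "fp16",
  device: "webgpu",
});

const splitter = new TextSplitterStream();
const stream = tts.stream(splitter, { voice: "af_heart" }); // hypothetical voice id; pick one from tts.voices

(async () => {
  for await (const { text, audio } of stream) {
    // Each chunk pairs the sentence that was synthesized with its audio,
    // which the worker forwards to the page via postMessage.
    console.log(text, audio);
  }
})();

splitter.push("Hello there!");
splitter.close(); // no more text will be pushed
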
@@ -54,19 +41,18 @@ self.postMessage({
   duration: "until_next",
 });
 
-// Load VAD model
+// Load models
 const silero_vad = await AutoModel.from_pretrained(
   "onnx-community/silero-vad",
   {
     config: { model_type: "custom" },
-    dtype: "fp32",
+    dtype: "fp32", // Full-precision
   },
 ).catch((error) => {
-  self.postMessage({ error: new Error(`VAD loading failed: ${error.message}`) });
+  self.postMessage({ error });
   throw error;
 });
 
-// Whisper configuration
 const DEVICE_DTYPE_CONFIGS = {
   webgpu: {
     encoder_model: "fp32",
@@ -77,66 +63,38 @@ const DEVICE_DTYPE_CONFIGS = {
     decoder_model_merged: "q8",
   },
 };
-
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "onnx-community/whisper-base",
+  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
   {
     device,
     dtype: DEVICE_DTYPE_CONFIGS[device],
-    // Specify language to avoid warnings
-    language: "en",
-    task: "transcribe",
   },
 ).catch((error) => {
-  self.postMessage({ error: new Error(`Whisper loading failed: ${error.message}`) });
+  self.postMessage({ error });
   throw error;
 });
 
-// Warm up the transcriber
-await transcriber(new Float32Array(INPUT_SAMPLE_RATE));
-
-// LLM Configuration - Split tokenizer and model sources
-const TOKENIZER_MODEL_ID = "Qwen/Qwen3-1.7B"; // Original repo has tokenizer
-const ONNX_MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX"; // ONNX weights
-
-// Load tokenizer from original repo
-const tokenizer = await AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID).catch((error) => {
-  self.postMessage({ error: new Error(`Tokenizer loading failed: ${error.message}`) });
-  throw error;
-});
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
 
-// Load ONNX model weights
-const llm = await AutoModelForCausalLM.from_pretrained(ONNX_MODEL_ID, {
+const llm_model_id = "onnx-community/Qwen3-1.7B-ONNX";
+const tokenizer = await AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B"); // Load tokenizer from original repo
+const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
   dtype: "q4f16",
   device: "webgpu",
-  // Add model-specific config for Qwen3
-  model_config: {
-    use_cache: true,
-    attention_bias: false,
-  }
-}).catch((error) => {
-  self.postMessage({ error: new Error(`LLM loading failed: ${error.message}`) });
-  throw error;
+  model_file_name: "model_q4f16.onnx" // Specify exact file to avoid external data format
 });
 
-// System prompt optimized for conversational AI
 const SYSTEM_MESSAGE = {
   role: "system",
   content:
-    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual. Focus on being natural and engaging in conversation.",
+    "You're a helpful and conversational voice assistant for financial managers, you have a high EQ and are great at math and behavioral finance. Keep your responses short, clear, and casual. /no_think",
 };
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
 
-// Warm up the LLM
-await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 });
-
-// Conversation state
 let messages = [SYSTEM_MESSAGE];
 let past_key_values_cache;
 let stopping_criteria;
-const MAX_CONTEXT_MESSAGES = 20; // Prevent unbounded memory growth
-
-// Send ready signal with available voices
 self.postMessage({
   type: "status",
   status: "ready",
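
The new loading code splits the LLM across two repos: the tokenizer comes from the original Qwen/Qwen3-1.7B repository, while the quantized weights come from onnx-community/Qwen3-1.7B-ONNX. A minimal sketch of that pattern with transformers.js, assuming the @huggingface/transformers package and omitting the model_file_name override shown above:

import { AutoTokenizer, AutoModelForCausalLM } from "@huggingface/transformers";

const tokenizer = await AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B");
const llm = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-1.7B-ONNX", {
  dtype: "q4f16",
  device: "webgpu",
});

// Build a chat prompt and generate a short reply.
const inputs = tokenizer.apply_chat_template(
  [{ role: "user", content: "Say hello in five words." }],
  { add_generation_prompt: true, return_dict: true },
);
const output = await llm.generate({ ...inputs, max_new_tokens: 32 });
console.log(tokenizer.batch_decode(output, { skip_special_tokens: true }));
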
@@ -144,17 +102,17 @@ self.postMessage({
   voices: tts.voices,
 });
 
-// Audio processing state
+// Global audio buffer to store incoming audio
 const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
 let bufferPointer = 0;
 
-// VAD state
+// Initial state for VAD
 const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
 let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
 
-// Recording state
+// Whether we are in the process of adding audio to the buffer
 let isRecording = false;
-let isPlaying = false;
+let isPlaying = false; // new flag
 
 /**
  * Perform Voice Activity Detection (VAD)
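
For a sense of scale, the preallocated BUFFER above holds MAX_BUFFER_DURATION seconds of mono float32 audio at INPUT_SAMPLE_RATE. Neither constant's value appears in this diff, but assuming typical values of 16 kHz and 30 seconds, the allocation works out as follows:

// Hypothetical values for illustration only; the real ones live in ./constants.
const INPUT_SAMPLE_RATE = 16000;  // samples per second
const MAX_BUFFER_DURATION = 30;   // seconds
const samples = MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE;  // 480,000 samples
const bytes = samples * Float32Array.BYTES_PER_ELEMENT;   // 1,920,000 bytes, about 1.9 MB
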
@@ -165,126 +123,86 @@ async function vad(buffer) {
   const input = new Tensor("float32", buffer, [1, buffer.length]);
 
   const { stateN, output } = await silero_vad({ input, sr, state });
-  state = stateN;
+  state = stateN; // Update state
 
   const isSpeech = output.data[0];
 
+  // Use heuristics to determine if the buffer is speech or not
   return (
+    // Case 1: We are above the threshold (definitely speech)
     isSpeech > SPEECH_THRESHOLD ||
+    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
     (isRecording && isSpeech >= EXIT_THRESHOLD)
   );
 }
 
 /**
- * Handle speech-to-speech pipeline
+ * Transcribe the audio buffer
  * @param {Float32Array} buffer The audio buffer
- * @param {Object} data Additional timing data
+ * @param {Object} data Additional data
  */
 const speechToSpeech = async (buffer, data) => {
   isPlaying = true;
 
-  try {
-    // 1. Transcribe audio
-    const transcription = await transcriber(buffer);
-    const text = transcription.text?.trim() || "";
-
-    if (!text || text === "[BLANK_AUDIO]") {
-      isPlaying = false;
-      return;
-    }
-
-    // Add user message
-    messages.push({ role: "user", content: text });
-
-    // Manage context window
-    if (messages.length > MAX_CONTEXT_MESSAGES) {
-      messages = [SYSTEM_MESSAGE, ...messages.slice(-(MAX_CONTEXT_MESSAGES - 1))];
-      past_key_values_cache = null; // Reset cache when context changes
-    }
-
-    // Set up TTS streaming
-    const splitter = new TextSplitterStream();
-    const stream = tts.stream(splitter, { voice });
-
-    // Stream TTS output
-    (async () => {
-      try {
-        for await (const { text, phonemes, audio } of stream) {
-          self.postMessage({ type: "output", text, result: audio });
-        }
-      } catch (error) {
-        console.error("TTS streaming error:", error);
-      }
-    })();
-
-    // 2. Generate LLM response
-    const inputs = tokenizer.apply_chat_template(messages, {
-      add_generation_prompt: true,
-      return_dict: true,
-      // Qwen3 specific - disable thinking mode for conversational use
-      enable_thinking: false,
-    });
-
-    const streamer = new TextStreamer(tokenizer, {
-      skip_prompt: true,
-      skip_special_tokens: true,
-      callback_function: (text) => {
-        splitter.push(text);
-      },
-      token_callback_function: () => {},
-    });
-
-    stopping_criteria = new InterruptableStoppingCriteria();
-
-    // Generate with appropriate settings for Qwen3
-    const { past_key_values, sequences } = await llm.generate({
-      ...inputs,
-      past_key_values: past_key_values_cache,
-
-      // Qwen3 optimal settings for non-thinking mode
-      do_sample: true,
-      temperature: 0.7,
-      top_p: 0.8,
-      top_k: 20,
-      max_new_tokens: 512, // Keep responses concise for voice
-
-      streamer,
-      stopping_criteria,
-      return_dict_in_generate: true,
-
-      // Ensure proper EOS handling for Qwen3
-      eos_token_id: [151643, 151645],
-      pad_token_id: tokenizer.pad_token_id,
-    });
-
-    past_key_values_cache = past_key_values;
-
-    // Close the TTS stream
-    splitter.close();
-
-    // Decode and store assistant response
-    const decoded = tokenizer.batch_decode(
-      sequences.slice(null, [inputs.input_ids.dims[1], null]),
-      { skip_special_tokens: true },
-    );
-
-    messages.push({ role: "assistant", content: decoded[0] });
-  } catch (error) {
-    console.error("Speech-to-speech error:", error);
-    self.postMessage({
-      type: "error",
-      error: new Error(`Processing failed: ${error.message}`)
-    });
-  } finally {
-    isPlaying = false;
-  }
+  // 1. Transcribe the audio from the user
+  const text = await transcriber(buffer).then(({ text }) => text.trim());
+  if (["", "[BLANK_AUDIO]"].includes(text)) {
+    // If the transcription is empty or a blank audio, we skip the rest of the processing
+    return;
+  }
+  messages.push({ role: "user", content: text });
+
+  // Set up text-to-speech streaming
+  const splitter = new TextSplitterStream();
+  const stream = tts.stream(splitter, {
+    voice,
+  });
+  (async () => {
+    for await (const { text, phonemes, audio } of stream) {
+      self.postMessage({ type: "output", text, result: audio });
+    }
+  })();
+
+  // 2. Generate a response using the LLM
+  const inputs = tokenizer.apply_chat_template(messages, {
+    add_generation_prompt: true,
+    return_dict: true,
+  });
+  const streamer = new TextStreamer(tokenizer, {
+    skip_prompt: true,
+    skip_special_tokens: true,
+    callback_function: (text) => {
+      splitter.push(text);
+    },
+    token_callback_function: () => {},
+  });
+
+  stopping_criteria = new InterruptableStoppingCriteria();
+  const { past_key_values, sequences } = await llm.generate({
+    ...inputs,
+    past_key_values: past_key_values_cache,
+
+    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
+    max_new_tokens: 1024,
+    streamer,
+    stopping_criteria,
+    return_dict_in_generate: true,
+  });
+  past_key_values_cache = past_key_values;
+
+  // Finally, close the stream to signal that no more text will be added.
+  splitter.close();
+
+  const decoded = tokenizer.batch_decode(
+    sequences.slice(null, [inputs.input_ids.dims[1], null]),
+    { skip_special_tokens: true },
+  );
+
+  messages.push({ role: "assistant", content: decoded[0] });
 };
 
-// Audio buffer management
+// Track the number of samples after the last speech chunk
 let postSpeechSamples = 0;
-let prevBuffers = [];
-
 const resetAfterRecording = (offset = 0) => {
   self.postMessage({
     type: "status",
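
One detail worth noting in the new speechToSpeech body: the past_key_values returned by generate() are stashed in past_key_values_cache and passed back in on the next turn, so each reply only prefills the newly appended messages instead of re-encoding the whole conversation. A stripped-down sketch of that pattern, reusing the tokenizer and llm handles created earlier in the file:

let kvCache = null;

async function reply(messages) {
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });
  const { past_key_values, sequences } = await llm.generate({
    ...inputs,
    past_key_values: kvCache, // reuse work from earlier turns
    max_new_tokens: 256,
    return_dict_in_generate: true,
  });
  kvCache = past_key_values; // keep for the next turn
  // Decode only the newly generated tokens, skipping the prompt.
  return tokenizer.batch_decode(
    sequences.slice(null, [inputs.input_ids.dims[1], null]),
    { skip_special_tokens: true },
  )[0];
}
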
@@ -299,39 +217,39 @@ const resetAfterRecording = (offset = 0) => {
 };
 
 const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
+  // Get start and end time of the speech segment, minus the padding
   const now = Date.now();
-  const end = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+  const end =
+    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
   const duration = end - start;
   const overflowLength = overflow?.length ?? 0;
 
-  // Prepare padded buffer
+  // Send the audio buffer to the worker
   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
 
   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
   const paddedBuffer = new Float32Array(prevLength + buffer.length);
-
   let offset = 0;
   for (const prev of prevBuffers) {
     paddedBuffer.set(prev, offset);
     offset += prev.length;
   }
   paddedBuffer.set(buffer, offset);
-
-  // Process speech
   speechToSpeech(paddedBuffer, { start, end, duration });
 
-  // Handle overflow
+  // Set overflow (if present) and reset the rest of the audio buffer
   if (overflow) {
     BUFFER.set(overflow, 0);
   }
   resetAfterRecording(overflowLength);
 };
 
-// Message handler
+let prevBuffers = [];
 self.onmessage = async (event) => {
   const { type, buffer } = event.data;
 
-  // Block audio during playback
+  // refuse new audio while playing back
   if (type === "audio" && isPlaying) return;
 
   switch (type) {
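
The timestamp arithmetic in dispatchForTranscriptionAndResetAudioBuffer converts sample counts back into wall-clock milliseconds. A worked example with assumed numbers (16 kHz input, 1,600 pad samples, 4,000 trailing-silence samples, 40,000 buffered samples; none of these values come from the diff):

const INPUT_SAMPLE_RATE = 16000;  // hypothetical
const SPEECH_PAD_SAMPLES = 1600;  // hypothetical, 100 ms of padding
const postSpeechSamples = 4000;   // 250 ms of trailing silence
const bufferPointer = 40000;      // 2.5 s of buffered audio

const now = Date.now();
const end = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000; // now - 350 ms
const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;                          // end - 2500 ms
const duration = end - start;                                                            // 2500 ms
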
@@ -343,7 +261,6 @@ self.onmessage = async (event) => {
     case "end_call":
       messages = [SYSTEM_MESSAGE];
       past_key_values_cache = null;
-      // Fall through to interrupt
     case "interrupt":
       stopping_criteria?.interrupt();
       return;
@@ -355,13 +272,15 @@ self.onmessage = async (event) => {
       return;
   }
 
-  // Process audio buffer
-  const wasRecording = isRecording;
+  const wasRecording = isRecording; // Save current state
   const isSpeech = await vad(buffer);
 
   if (!wasRecording && !isSpeech) {
-    // Queue non-speech buffers for padding
+    // We are not recording, and the buffer is not speech,
+    // so we will probably discard the buffer. So, we insert
+    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+      // If the queue is full, we discard the oldest buffer
       prevBuffers.shift();
     }
     prevBuffers.push(buffer);
@@ -370,21 +289,25 @@ self.onmessage = async (event) => {
 
   const remaining = BUFFER.length - bufferPointer;
   if (buffer.length >= remaining) {
-    // Buffer overflow - trigger transcription
+    // The buffer is larger than (or equal to) the remaining space in the global buffer,
+    // so we perform transcription and copy the overflow to the global buffer
     BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
     bufferPointer += remaining;
 
+    // Dispatch the audio buffer
     const overflow = buffer.subarray(remaining);
     dispatchForTranscriptionAndResetAudioBuffer(overflow);
     return;
   } else {
-    // Add to buffer
+    // The buffer is smaller than the remaining space in the global buffer,
+    // so we copy it to the global buffer
    BUFFER.set(buffer, bufferPointer);
     bufferPointer += buffer.length;
   }
 
   if (isSpeech) {
     if (!isRecording) {
+      // Indicate start of recording
       self.postMessage({
         type: "status",
         status: "recording_start",
@@ -392,19 +315,25 @@ self.onmessage = async (event) => {
         duration: "until_next",
       });
     }
+    // Start or continue recording
     isRecording = true;
-    postSpeechSamples = 0;
+    postSpeechSamples = 0; // Reset the post-speech samples
     return;
   }
 
   postSpeechSamples += buffer.length;
 
-  // Check for end of speech
+  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
+  // So, we check whether we have reached the end of the current audio chunk.
   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+    // There was a short pause, but not long enough to consider the end of a speech chunk
+    // (e.g., the speaker took a breath), so we continue recording
     return;
   }
 
   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+    // The entire buffer (including the new chunk) is smaller than the minimum
+    // duration of a speech chunk, so we can safely discard the buffer.
     resetAfterRecording();
     return;
   }
@@ -412,18 +341,15 @@ self.onmessage = async (event) => {
   dispatchForTranscriptionAndResetAudioBuffer();
 };
 
-// Greeting function
 function greet(text) {
   isPlaying = true;
   const splitter = new TextSplitterStream();
   const stream = tts.stream(splitter, { voice });
-
   (async () => {
     for await (const { text: chunkText, audio } of stream) {
       self.postMessage({ type: "output", text: chunkText, result: audio });
     }
   })();
-
   splitter.push(text);
   splitter.close();
   messages.push({ role: "assistant", content: text });
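
For readers wiring the worker up, the message protocol visible in this file is: the page posts { type: "audio", buffer } chunks plus "interrupt" and "end_call" controls, and the worker replies with "status", "output" (text plus synthesized audio), and error messages. A minimal main-thread sketch under those assumptions; the worker path and the playback helper are placeholders, not part of this commit:

// main.js (hypothetical glue code)
const worker = new Worker(new URL("./worker.js", import.meta.url), { type: "module" });

worker.onmessage = ({ data }) => {
  if (data.error) console.error(data.error);
  else if (data.type === "status") console.log("worker status:", data.status, data.voices ?? "");
  else if (data.type === "output") playAudioChunk(data.result); // placeholder playback helper
};

// Inside the microphone capture callback: forward mono Float32 PCM chunks.
function onAudioChunk(chunk /* Float32Array at the worker's expected sample rate */) {
  worker.postMessage({ type: "audio", buffer: chunk });
}

// Barge-in / hang-up controls matching the worker's switch statement.
const interrupt = () => worker.postMessage({ type: "interrupt" });
const endCall = () => worker.postMessage({ type: "end_call" });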
 