conversational-webgpu

Running

App Files Community

nisten committed on Jun 17

Commit

277ac94

verified ·

1 Parent(s): 86d4810

Update src/worker.js

Browse files

Files changed (1) hide show

src/worker.js +102 -176

src/worker.js CHANGED Viewed

@@ -26,24 +26,11 @@ import {
   MIN_SPEECH_DURATION_SAMPLES,
 } from "./constants";
-// WebGPU availability check - fail fast
-if (!navigator.gpu) {
-  self.postMessage({
-    type: "error",
-    error: new Error("WebGPU not supported. This app requires Chrome 113+, Edge 113+, or Chrome Canary with WebGPU enabled.")
-  });
-  throw new Error("WebGPU not available");
-}
-// TTS Configuration
 const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 let voice;
 const tts = await KokoroTTS.from_pretrained(model_id, {
-  dtype: "fp16", // Keep fp16 for memory efficiency
   device: "webgpu",
-}).catch((error) => {
-  self.postMessage({ error: new Error(`TTS loading failed: ${error.message}`) });
-  throw error;
 });
 const device = "webgpu";
@@ -54,19 +41,18 @@ self.postMessage({
   duration: "until_next",
 });
-// Load VAD model
 const silero_vad = await AutoModel.from_pretrained(
   "onnx-community/silero-vad",
   {
     config: { model_type: "custom" },
-    dtype: "fp32",
   },
 ).catch((error) => {
-  self.postMessage({ error: new Error(`VAD loading failed: ${error.message}`) });
   throw error;
 });
-// Whisper configuration
 const DEVICE_DTYPE_CONFIGS = {
   webgpu: {
     encoder_model: "fp32",
@@ -77,66 +63,38 @@ const DEVICE_DTYPE_CONFIGS = {
     decoder_model_merged: "q8",
   },
 };
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "onnx-community/whisper-base",
   {
     device,
     dtype: DEVICE_DTYPE_CONFIGS[device],
-    // Specify language to avoid warnings
-    language: "en",
-    task: "transcribe",
   },
 ).catch((error) => {
-  self.postMessage({ error: new Error(`Whisper loading failed: ${error.message}`) });
   throw error;
 });
-// Warm up the transcriber
-await transcriber(new Float32Array(INPUT_SAMPLE_RATE));
-// LLM Configuration - Split tokenizer and model sources
-const TOKENIZER_MODEL_ID = "Qwen/Qwen3-1.7B"; // Original repo has tokenizer
-const ONNX_MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX"; // ONNX weights
-// Load tokenizer from original repo
-const tokenizer = await AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID).catch((error) => {
-  self.postMessage({ error: new Error(`Tokenizer loading failed: ${error.message}`) });
-  throw error;
-});
-// Load ONNX model weights
-const llm = await AutoModelForCausalLM.from_pretrained(ONNX_MODEL_ID, {
   dtype: "q4f16",
   device: "webgpu",
-  // Add model-specific config for Qwen3
-  model_config: {
-    use_cache: true,
-    attention_bias: false,
-  }
-}).catch((error) => {
-  self.postMessage({ error: new Error(`LLM loading failed: ${error.message}`) });
-  throw error;
 });
-// System prompt optimized for conversational AI
 const SYSTEM_MESSAGE = {
   role: "system",
   content:
-    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual. Focus on being natural and engaging in conversation.",
 };
-// Warm up the LLM
-await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 });
-// Conversation state
 let messages = [SYSTEM_MESSAGE];
 let past_key_values_cache;
 let stopping_criteria;
-const MAX_CONTEXT_MESSAGES = 20; // Prevent unbounded memory growth
-// Send ready signal with available voices
 self.postMessage({
   type: "status",
   status: "ready",
@@ -144,17 +102,17 @@ self.postMessage({
   voices: tts.voices,
 });
-// Audio processing state
 const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
 let bufferPointer = 0;
-// VAD state
 const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
 let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
-// Recording state
 let isRecording = false;
-let isPlaying = false;
 /**
  * Perform Voice Activity Detection (VAD)
@@ -165,126 +123,86 @@ async function vad(buffer) {
   const input = new Tensor("float32", buffer, [1, buffer.length]);
   const { stateN, output } = await silero_vad({ input, sr, state });
-  state = stateN;
   const isSpeech = output.data[0];
   return (
     isSpeech > SPEECH_THRESHOLD ||
     (isRecording && isSpeech >= EXIT_THRESHOLD)
   );
 }
 /**
- * Handle speech-to-speech pipeline
  * @param {Float32Array} buffer The audio buffer
- * @param {Object} data Additional timing data
  */
 const speechToSpeech = async (buffer, data) => {
   isPlaying = true;
-  try {
-    // 1. Transcribe audio
-    const transcription = await transcriber(buffer);
-    const text = transcription.text?.trim() || "";
-    if (!text || text === "[BLANK_AUDIO]") {
-      isPlaying = false;
-      return;
     }
-    // Add user message
-    messages.push({ role: "user", content: text });
-    // Manage context window
-    if (messages.length > MAX_CONTEXT_MESSAGES) {
-      messages = [SYSTEM_MESSAGE, ...messages.slice(-(MAX_CONTEXT_MESSAGES - 1))];
-      past_key_values_cache = null; // Reset cache when context changes
-    }
-    // Set up TTS streaming
-    const splitter = new TextSplitterStream();
-    const stream = tts.stream(splitter, { voice });
-    // Stream TTS output
-    (async () => {
-      try {
-        for await (const { text, phonemes, audio } of stream) {
-          self.postMessage({ type: "output", text, result: audio });
-        }
-      } catch (error) {
-        console.error("TTS streaming error:", error);
-      }
-    })();
-    // 2. Generate LLM response
-    const inputs = tokenizer.apply_chat_template(messages, {
-      add_generation_prompt: true,
-      return_dict: true,
-      // Qwen3 specific - disable thinking mode for conversational use
-      enable_thinking: false,
-    });
-    const streamer = new TextStreamer(tokenizer, {
-      skip_prompt: true,
-      skip_special_tokens: true,
-      callback_function: (text) => {
-        splitter.push(text);
-      },
-      token_callback_function: () => {},
-    });
-    stopping_criteria = new InterruptableStoppingCriteria();
-    // Generate with appropriate settings for Qwen3
-    const { past_key_values, sequences } = await llm.generate({
-      ...inputs,
-      past_key_values: past_key_values_cache,
-      // Qwen3 optimal settings for non-thinking mode
-      do_sample: true,
-      temperature: 0.7,
-      top_p: 0.8,
-      top_k: 20,
-      max_new_tokens: 512, // Keep responses concise for voice
-      streamer,
-      stopping_criteria,
-      return_dict_in_generate: true,
-      // Ensure proper EOS handling for Qwen3
-      eos_token_id: [151643, 151645],
-      pad_token_id: tokenizer.pad_token_id,
-    });
-    past_key_values_cache = past_key_values;
-    // Close the TTS stream
-    splitter.close();
-    // Decode and store assistant response
-    const decoded = tokenizer.batch_decode(
-      sequences.slice(null, [inputs.input_ids.dims[1], null]),
-      { skip_special_tokens: true },
-    );
-    messages.push({ role: "assistant", content: decoded[0] });
-  } catch (error) {
-    console.error("Speech-to-speech error:", error);
-    self.postMessage({
-      type: "error",
-      error: new Error(`Processing failed: ${error.message}`)
-    });
-  } finally {
-    isPlaying = false;
-  }
 };
-// Audio buffer management
 let postSpeechSamples = 0;
-let prevBuffers = [];
 const resetAfterRecording = (offset = 0) => {
   self.postMessage({
     type: "status",
@@ -299,39 +217,39 @@ const resetAfterRecording = (offset = 0) => {
 };
 const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
   const now = Date.now();
-  const end = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
   const duration = end - start;
   const overflowLength = overflow?.length ?? 0;
-  // Prepare padded buffer
   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
   const paddedBuffer = new Float32Array(prevLength + buffer.length);
   let offset = 0;
   for (const prev of prevBuffers) {
     paddedBuffer.set(prev, offset);
     offset += prev.length;
   }
   paddedBuffer.set(buffer, offset);
-  // Process speech
   speechToSpeech(paddedBuffer, { start, end, duration });
-  // Handle overflow
   if (overflow) {
     BUFFER.set(overflow, 0);
   }
   resetAfterRecording(overflowLength);
 };
-// Message handler
 self.onmessage = async (event) => {
   const { type, buffer } = event.data;
-  // Block audio during playback
   if (type === "audio" && isPlaying) return;
   switch (type) {
@@ -343,7 +261,6 @@ self.onmessage = async (event) => {
     case "end_call":
       messages = [SYSTEM_MESSAGE];
       past_key_values_cache = null;
-      // Fall through to interrupt
     case "interrupt":
       stopping_criteria?.interrupt();
       return;
@@ -355,13 +272,15 @@ self.onmessage = async (event) => {
       return;
   }
-  // Process audio buffer
-  const wasRecording = isRecording;
   const isSpeech = await vad(buffer);
   if (!wasRecording && !isSpeech) {
-    // Queue non-speech buffers for padding
     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
       prevBuffers.shift();
     }
     prevBuffers.push(buffer);
@@ -370,21 +289,25 @@ self.onmessage = async (event) => {
   const remaining = BUFFER.length - bufferPointer;
   if (buffer.length >= remaining) {
-    // Buffer overflow - trigger transcription
     BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
     bufferPointer += remaining;
     const overflow = buffer.subarray(remaining);
     dispatchForTranscriptionAndResetAudioBuffer(overflow);
     return;
   } else {
-    // Add to buffer
     BUFFER.set(buffer, bufferPointer);
     bufferPointer += buffer.length;
   }
   if (isSpeech) {
     if (!isRecording) {
       self.postMessage({
         type: "status",
         status: "recording_start",
@@ -392,19 +315,25 @@ self.onmessage = async (event) => {
         duration: "until_next",
       });
     }
     isRecording = true;
-    postSpeechSamples = 0;
     return;
   }
   postSpeechSamples += buffer.length;
-  // Check for end of speech
   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
     return;
   }
   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
     resetAfterRecording();
     return;
   }
@@ -412,18 +341,15 @@ self.onmessage = async (event) => {
   dispatchForTranscriptionAndResetAudioBuffer();
 };
-// Greeting function
 function greet(text) {
   isPlaying = true;
   const splitter = new TextSplitterStream();
   const stream = tts.stream(splitter, { voice });
   (async () => {
     for await (const { text: chunkText, audio } of stream) {
       self.postMessage({ type: "output", text: chunkText, result: audio });
     }
   })();
   splitter.push(text);
   splitter.close();
   messages.push({ role: "assistant", content: text });

   MIN_SPEECH_DURATION_SAMPLES,
 } from "./constants";
 const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 let voice;
 const tts = await KokoroTTS.from_pretrained(model_id, {
+  dtype: "fp16",
   device: "webgpu",
 });
 const device = "webgpu";
   duration: "until_next",
 });
+// Load models
 const silero_vad = await AutoModel.from_pretrained(
   "onnx-community/silero-vad",
   {
     config: { model_type: "custom" },
+    dtype: "fp32", // Full-precision
   },
 ).catch((error) => {
+  self.postMessage({ error });
   throw error;
 });
 const DEVICE_DTYPE_CONFIGS = {
   webgpu: {
     encoder_model: "fp32",
     decoder_model_merged: "q8",
   },
 };
 const transcriber = await pipeline(
   "automatic-speech-recognition",
+  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
   {
     device,
     dtype: DEVICE_DTYPE_CONFIGS[device],
   },
 ).catch((error) => {
+  self.postMessage({ error });
   throw error;
 });
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+const llm_model_id = "onnx-community/Qwen3-1.7B-ONNX";
+const tokenizer = await AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B"); // Load tokenizer from original repo
+const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
   dtype: "q4f16",
   device: "webgpu",
+  model_file_name: "model_q4f16.onnx" // Specify exact file to avoid external data format
 });
 const SYSTEM_MESSAGE = {
   role: "system",
   content:
+    "You're a helpful and conversational voice assistant for financial managers, you have a high EQ and are great at math and behavioral finance. Keep your responses short, clear, and casual. /no_think",
 };
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
 let messages = [SYSTEM_MESSAGE];
 let past_key_values_cache;
 let stopping_criteria;
 self.postMessage({
   type: "status",
   status: "ready",
   voices: tts.voices,
 });
+// Global audio buffer to store incoming audio
 const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
 let bufferPointer = 0;
+// Initial state for VAD
 const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
 let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
+// Whether we are in the process of adding audio to the buffer
 let isRecording = false;
+let isPlaying = false; // new flag
 /**
  * Perform Voice Activity Detection (VAD)
   const input = new Tensor("float32", buffer, [1, buffer.length]);
   const { stateN, output } = await silero_vad({ input, sr, state });
+  state = stateN; // Update state
   const isSpeech = output.data[0];
+  // Use heuristics to determine if the buffer is speech or not
   return (
+    // Case 1: We are above the threshold (definitely speech)
     isSpeech > SPEECH_THRESHOLD ||
+    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
     (isRecording && isSpeech >= EXIT_THRESHOLD)
   );
 }
 /**
  * Run one conversational turn: transcribe the user's audio, generate a reply
  * with the LLM, and stream that reply through TTS to the main thread as
  * `{ type: "output", text, result }` messages.
  *
  * `isPlaying` is raised for the whole turn. It is lowered here only when the
  * turn ends without a complete reply (blank transcription or an error);
  * after a successful turn it is left set.
  * NOTE(review): presumably a playback-finished message from the main thread
  * clears it in that case — confirm against the message handler.
  *
  * Failures are logged and reported with `self.postMessage({ error })` instead
  * of rejecting, because the dispatching code calls this without awaiting it.
  *
  * @param {Float32Array} buffer The audio buffer
  * @param {Object} data Additional data (not read by this function)
  * @returns {Promise<void>}
  */
 const speechToSpeech = async (buffer, data) => {
   isPlaying = true;

   try {
     // 1. Transcribe the audio from the user
     const text = await transcriber(buffer).then(({ text }) => text.trim());
     if (["", "[BLANK_AUDIO]"].includes(text)) {
       // Nothing will be spoken for this turn. Release the flag here,
       // otherwise incoming "audio" messages keep being dropped.
       isPlaying = false;
       return;
     }
     messages.push({ role: "user", content: text });

     // Tokenize the conversation before opening the TTS stream, so a failure
     // here cannot leave an open splitter behind.
     const inputs = tokenizer.apply_chat_template(messages, {
       add_generation_prompt: true,
       return_dict: true,
     });

     // Set up text-to-speech streaming
     const splitter = new TextSplitterStream();
     const stream = tts.stream(splitter, {
       voice,
     });
     // Forward synthesized audio as it becomes available. This runs
     // concurrently with generation, so its failures are handled here.
     (async () => {
       for await (const { text: spoken, audio } of stream) {
         self.postMessage({ type: "output", text: spoken, result: audio });
       }
     })().catch((error) => {
       console.error("TTS streaming error:", error);
     });

     // 2. Generate a response using the LLM
     let generation;
     try {
       const streamer = new TextStreamer(tokenizer, {
         skip_prompt: true,
         skip_special_tokens: true,
         callback_function: (chunk) => {
           splitter.push(chunk);
         },
         token_callback_function: () => {},
       });

       stopping_criteria = new InterruptableStoppingCriteria();
       generation = await llm.generate({
         ...inputs,
         past_key_values: past_key_values_cache,
         do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
         max_new_tokens: 1024,
         streamer,
         stopping_criteria,
         return_dict_in_generate: true,
       });
     } finally {
       // Always close the stream to signal that no more text will be added,
       // even when generation throws.
       splitter.close();
     }

     const { past_key_values, sequences } = generation;
     past_key_values_cache = past_key_values;

     // Keep only the newly generated tokens (everything after the prompt).
     const decoded = tokenizer.batch_decode(
       sequences.slice(null, [inputs.input_ids.dims[1], null]),
       { skip_special_tokens: true },
     );
     messages.push({ role: "assistant", content: decoded[0] });
   } catch (error) {
     console.error("Speech-to-speech error:", error);
     self.postMessage({ error });
     // The turn did not complete, so do not keep blocking incoming audio.
     isPlaying = false;
   }
 };
+// Track the number of samples after the last speech chunk
 let postSpeechSamples = 0;
 const resetAfterRecording = (offset = 0) => {
   self.postMessage({
     type: "status",
 };
 /**
  * Hand the recorded audio (preceded by the queued pre-speech buffers) to
  * `speechToSpeech`, then reset the global buffer for the next recording.
  *
  * @param {Float32Array} [overflow] Samples that did not fit into the global
  * buffer; when given, they become the start of the next recording.
  */
 const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
   // Start/end time (ms) of the speech segment, with the trailing
   // post-speech samples and the padding subtracted.
   const trailingSamples = postSpeechSamples + SPEECH_PAD_SAMPLES;
   const end = Date.now() - (trailingSamples / INPUT_SAMPLE_RATE) * 1000;
   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
   const duration = end - start;
   const overflowLength = overflow?.length ?? 0;

   // Recorded samples plus padding, copied out of the global buffer.
   const recorded = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);

   // Concatenate: [queued pre-speech buffers..., recorded samples].
   let leadingLength = 0;
   prevBuffers.forEach((chunk) => {
     leadingLength += chunk.length;
   });
   const paddedBuffer = new Float32Array(leadingLength + recorded.length);
   let writeAt = 0;
   prevBuffers.forEach((chunk) => {
     paddedBuffer.set(chunk, writeAt);
     writeAt += chunk.length;
   });
   paddedBuffer.set(recorded, writeAt);

   // Start the speech-to-speech turn (not awaited).
   speechToSpeech(paddedBuffer, { start, end, duration });

   // Set overflow (if present) and reset the rest of the audio buffer
   if (overflow) {
     BUFFER.set(overflow, 0);
   }
   resetAfterRecording(overflowLength);
 };
+let prevBuffers = [];
 self.onmessage = async (event) => {
   const { type, buffer } = event.data;
+  // refuse new audio while playing back
   if (type === "audio" && isPlaying) return;
   switch (type) {
     case "end_call":
       messages = [SYSTEM_MESSAGE];
       past_key_values_cache = null;
     case "interrupt":
       stopping_criteria?.interrupt();
       return;
       return;
   }
+  const wasRecording = isRecording; // Save current state
   const isSpeech = await vad(buffer);
   if (!wasRecording && !isSpeech) {
+    // We are not recording, and the buffer is not speech,
+    // so we will probably discard the buffer. So, we insert
+    // into a FIFO queue with maximum size of MAX_NUM_PREV_BUFFERS
     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+      // If the queue is full, we discard the oldest buffer
       prevBuffers.shift();
     }
     prevBuffers.push(buffer);
   const remaining = BUFFER.length - bufferPointer;
   if (buffer.length >= remaining) {
+    // The buffer is larger than (or equal to) the remaining space in the global buffer,
+    // so we perform transcription and copy the overflow to the global buffer
     BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
     bufferPointer += remaining;
+    // Dispatch the audio buffer
     const overflow = buffer.subarray(remaining);
     dispatchForTranscriptionAndResetAudioBuffer(overflow);
     return;
   } else {
+    // The buffer is smaller than the remaining space in the global buffer,
+    // so we copy it to the global buffer
     BUFFER.set(buffer, bufferPointer);
     bufferPointer += buffer.length;
   }
   if (isSpeech) {
     if (!isRecording) {
+      // Indicate start of recording
       self.postMessage({
         type: "status",
         status: "recording_start",
         duration: "until_next",
       });
     }
+    // Start or continue recording
     isRecording = true;
+    postSpeechSamples = 0; // Reset the post-speech samples
     return;
   }
   postSpeechSamples += buffer.length;
+  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
+  // So, we check whether we have reached the end of the current audio chunk.
   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+    // There was a short pause, but not long enough to consider the end of a speech chunk
+    // (e.g., the speaker took a breath), so we continue recording
     return;
   }
   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+    // The entire buffer (including the new chunk) is smaller than the minimum
+    // duration of a speech chunk, so we can safely discard the buffer.
     resetAfterRecording();
     return;
   }
   dispatchForTranscriptionAndResetAudioBuffer();
 };
 function greet(text) {
   isPlaying = true;
   const splitter = new TextSplitterStream();
   const stream = tts.stream(splitter, { voice });
   (async () => {
     for await (const { text: chunkText, audio } of stream) {
       self.postMessage({ type: "output", text: chunkText, result: audio });
     }
   })();
   splitter.push(text);
   splitter.close();
   messages.push({ role: "assistant", content: text });