csukuangfj committed
Commit 98034f5 · Parent(s): 2539ef2

update model

app-vad-asr.js CHANGED
@@ -5,7 +5,6 @@
 const startBtn = document.getElementById('startBtn');
 const stopBtn = document.getElementById('stopBtn');
 const clearBtn = document.getElementById('clearBtn');
-const hint = document.getElementById('hint');
 const soundClips = document.getElementById('sound-clips');

 let textArea = document.getElementById('results');
@@ -16,7 +15,7 @@ let resultList = [];
 clearBtn.onclick = function() {
   resultList = [];
   textArea.value = getDisplayResult();
-  textArea.scrollTop = textArea.scrollHeight; // auto scroll
+  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
 };

 function getDisplayResult() {
@@ -41,19 +40,17 @@ function getDisplayResult()
   return ans;
 }

-
-
 Module = {};

 let audioCtx;
 let mediaStream;

 let expectedSampleRate = 16000;
-let recordSampleRate; // the sampleRate of the microphone
-let recorder = null; // the microphone
-let leftchannel = []; // TODO: Use a single channel
+let recordSampleRate;  // the sampleRate of the microphone
+let recorder = null;   // the microphone
+let leftchannel = [];  // TODO: Use a single channel

-let recordingLength = 0; // number of samples so far
+let recordingLength = 0;  // number of samples so far

 let vad = null;
 let buffer = null;
@@ -76,47 +73,47 @@ function createOfflineRecognizerSenseVoice() {}

 function initOfflineRecognizer() {
   let config = {
-    modelConfig: {
-      debug: 1,
-      tokens: './tokens.txt',
+    modelConfig : {
+      debug : 1,
+      tokens : './tokens.txt',
     },
   };
   if (fileExists('sense-voice.onnx') == 1) {
     config.modelConfig.senseVoice = {
-      model: './sense-voice.onnx',
-      useInverseTextNormalization: 1,
+      model : './sense-voice.onnx',
+      useInverseTextNormalization : 1,
     };
   } else if (fileExists('whisper-encoder.onnx')) {
     config.modelConfig.whisper = {
-      encoder: './whisper-encoder.onnx',
-      decoder: './whisper-decoder.onnx',
+      encoder : './whisper-encoder.onnx',
+      decoder : './whisper-decoder.onnx',
     };
   } else if (fileExists('transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder: './transducer-encoder.onnx',
-      decoder: './transducer-decoder.onnx',
-      joiner: './transducer-joiner.onnx',
+      encoder : './transducer-encoder.onnx',
+      decoder : './transducer-decoder.onnx',
+      joiner : './transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'transducer';
   } else if (fileExists('nemo-transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder: './nemo-transducer-encoder.onnx',
-      decoder: './nemo-transducer-decoder.onnx',
-      joiner: './nemo-transducer-joiner.onnx',
+      encoder : './nemo-transducer-encoder.onnx',
+      decoder : './nemo-transducer-decoder.onnx',
+      joiner : './nemo-transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'nemo_transducer';
   } else if (fileExists('paraformer.onnx')) {
     config.modelConfig.paraformer = {
-      model: './paraformer.onnx',
+      model : './paraformer.onnx',
     };
   } else if (fileExists('telespeech.onnx')) {
     config.modelConfig.telespeechCtc = './telespeech.onnx';
   } else if (fileExists('moonshine-preprocessor.onnx')) {
     config.modelConfig.moonshine = {
-      preprocessor: './moonshine-preprocessor.onnx',
-      encoder: './moonshine-encoder.onnx',
-      uncachedDecoder: './moonshine-uncached-decoder.onnx',
-      cachedDecoder: './moonshine-cached-decoder.onnx'
+      preprocessor : './moonshine-preprocessor.onnx',
+      encoder : './moonshine-encoder.onnx',
+      uncachedDecoder : './moonshine-uncached-decoder.onnx',
+      cachedDecoder : './moonshine-cached-decoder.onnx'
     };
   } else {
     console.log('Please specify a model.');
@@ -126,9 +123,37 @@ function initOfflineRecognizer() {
   recognizer = new OfflineRecognizer(config, Module);
 }

+// https://emscripten.org/docs/api_reference/module.html#Module.locateFile
+Module.locateFile = function(path, scriptDirectory = '') {
+  console.log(`path: ${path}, scriptDirectory: ${scriptDirectory}`);
+  return scriptDirectory + path;
+};
+
+// https://emscripten.org/docs/api_reference/module.html#Module.setStatus
+Module.setStatus = function(status) {
+  console.log(`status ${status}`);
+  const statusElement = document.getElementById('status');
+  if (status == 'Running...') {
+    status = 'Model downloaded. Initializing recognizer...';
+  }
+  statusElement.textContent = status;
+  if (status === '') {
+    statusElement.style.display = 'none';
+    // statusElement.parentNode.removeChild(statusElement);
+
+    document.querySelectorAll('.tab-content').forEach((tabContentElement) => {
+      tabContentElement.classList.remove('loading');
+    });
+  } else {
+    statusElement.style.display = 'block';
+    document.querySelectorAll('.tab-content').forEach((tabContentElement) => {
+      tabContentElement.classList.add('loading');
+    });
+  }
+};
+
 Module.onRuntimeInitialized = function() {
   console.log('inited!');
-  hint.innerText = 'Model loaded! Please click start';

   startBtn.disabled = false;

@@ -141,17 +166,15 @@ Module.onRuntimeInitialized = function() {
   initOfflineRecognizer();
 };

-
-
 if (navigator.mediaDevices.getUserMedia) {
   console.log('getUserMedia supported.');

   // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
-  const constraints = {audio: true};
+  const constraints = {audio : true};

   let onSuccess = function(stream) {
     if (!audioCtx) {
-      audioCtx = new AudioContext({sampleRate: expectedSampleRate});
+      audioCtx = new AudioContext({sampleRate : expectedSampleRate});
     }
     console.log(audioCtx);
     recordSampleRate = audioCtx.sampleRate;
@@ -219,7 +242,6 @@ if (navigator.mediaDevices.getUserMedia) {

       resultList.push(durationStr);

-
       // now save the segment to a wav file
       let buf = new Int16Array(segment.samples.length);
       for (var i = 0; i < segment.samples.length; ++i) {
@@ -277,7 +299,7 @@ if (navigator.mediaDevices.getUserMedia) {
     }

     textArea.value = getDisplayResult();
-    textArea.scrollTop = textArea.scrollHeight; // auto scroll
+    textArea.scrollTop = textArea.scrollHeight;  // auto scroll
   };

   startBtn.onclick = function() {
@@ -308,9 +330,8 @@ if (navigator.mediaDevices.getUserMedia) {
     };
   };

-  let onError = function(err) {
-    console.log('The following error occurred: ' + err);
-  };
+  let onError = function(
+      err) { console.log('The following error occurred: ' + err); };

   navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
 } else {
@@ -318,7 +339,6 @@ if (navigator.mediaDevices.getUserMedia) {
   alert('getUserMedia not supported on your browser!');
 }

-
 // this function is copied/modified from
 // https://gist.github.com/meziantou/edb7217fddfbb70e899e
 function flatten(listOfSamples) {
@@ -344,22 +364,22 @@ function toWav(samples) {

   // http://soundfile.sapp.org/doc/WaveFormat/
   //                   F F I R
-  view.setUint32(0, 0x46464952, true); // chunkID
-  view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
+  view.setUint32(0, 0x46464952, true);               // chunkID
+  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
   //                   E V A W
-  view.setUint32(8, 0x45564157, true); // format
-  //
+  view.setUint32(8, 0x45564157, true);  // format
+  //
   //                     t m f
-  view.setUint32(12, 0x20746d66, true); // subchunk1ID
-  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
-  view.setUint32(20, 1, true); // audioFormat, 1 for PCM
-  view.setUint16(22, 1, true); // numChannels: 1 channel
-  view.setUint32(24, expectedSampleRate, true); // sampleRate
-  view.setUint32(28, expectedSampleRate * 2, true); // byteRate
-  view.setUint16(32, 2, true); // blockAlign
-  view.setUint16(34, 16, true); // bitsPerSample
-  view.setUint32(36, 0x61746164, true); // Subchunk2ID
-  view.setUint32(40, samples.length * 2, true); // subchunk2Size
+  view.setUint32(12, 0x20746d66, true);              // subchunk1ID
+  view.setUint32(16, 16, true);                      // subchunk1Size, 16 for PCM
+  view.setUint32(20, 1, true);                       // audioFormat, 1 for PCM
+  view.setUint16(22, 1, true);                       // numChannels: 1 channel
+  view.setUint32(24, expectedSampleRate, true);      // sampleRate
+  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
+  view.setUint16(32, 2, true);                       // blockAlign
+  view.setUint16(34, 16, true);                      // bitsPerSample
+  view.setUint32(36, 0x61746164, true);              // Subchunk2ID
+  view.setUint32(40, samples.length * 2, true);      // subchunk2Size

   let offset = 44;
   for (let i = 0; i < samples.length; ++i) {
@@ -367,7 +387,7 @@ function toWav(samples) {
     offset += 2;
   }

-  return new Blob([view], {type: 'audio/wav'});
+  return new Blob([ view ], {type : 'audio/wav'});
 }

 // this function is copied from
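
Aside: the model-selection chain in initOfflineRecognizer() probes for packed model files via fileExists(), which is defined outside the hunks shown above. As a rough, hypothetical sketch (the app's real helper may differ), such a probe can be written against Emscripten's virtual filesystem, assuming FS is reachable from the module:

// Hypothetical sketch only -- not this repo's actual fileExists().
// Assumes the module was built with FS exported
// (e.g. -sEXPORTED_RUNTIME_METHODS=FS).
function fileExists(filename) {
  try {
    // analyzePath() reports rather than throws when the path is missing.
    return Module.FS.analyzePath('/' + filename).exists ? 1 : 0;
  } catch (e) {
    return 0;  // FS not exported or runtime not initialized yet
  }
}
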
index.html CHANGED
@@ -11,30 +11,68 @@
     textarea {
       width:100%;
     }
+    .loading {
+      display: none !important;
+    }
   </style>
 </head>

-<body>
+<body style="font-family: 'Source Sans Pro', sans-serif; background-color: #f9fafb; color: #333; display: flex; flex-direction: column; align-items: center; height: 100vh; margin: 0;">
   <h1>
     Next-gen Kaldi + WebAssembly<br/>
     VAD+ASR Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
     (with Zipformer supporting Japanese 日语)
   </h1>

-  <div>
-    <span id="hint">Loading model ... ...</span>
-    <br/>
-    <br/>
-    <button id="startBtn" disabled>Start</button>
-    <button id="stopBtn" disabled>Stop</button>
-    <button id="clearBtn">Clear</button>
-    <br/>
-    <br/>
-    <textarea id="results" rows="10" readonly></textarea>
+  <div style="width: 100%; max-width: 900px; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); flex: 1;">
+    <div id="status">Loading...</div>
+
+    <div id="singleAudioContent" class="tab-content loading">
+      <div style="display: flex; gap: 1.5rem;">
+        <div style="flex: 1; display: flex; flex-direction: row; align-items: center; gap: 1rem;">
+          <button id="startBtn" disabled>Start</button>
+          <button id="stopBtn" disabled>Stop</button>
+          <button id="clearBtn">Clear</button>
+        </div>
+      </div>
+
+      <div style="flex: 1; display: flex; flex-direction: column; gap: 1rem;">
+        <div style="font-size: 1rem; font-weight: bold; padding: 0.5rem 1rem; background-color: #f8f9fa; border-radius: 8px; color: #6c757d;">Transcript</div>
+        <textarea id="results" rows="10" placeholder="Output will appear here..." readonly style="flex: 1; padding: 0.75rem; font-size: 1rem; border: 1px solid #ced4da; border-radius: 8px; resize: none; background-color: #f8f9fa;"></textarea>
+      </div>
+
+      <section flex="1" overflow="auto" id="sound-clips">
+      </section>
   </div>

-  <section flex="1" overflow="auto" id="sound-clips">
-  </section>
+  <!-- Footer Section -->
+  <div style="width: 100%; max-width: 900px; margin-top: 1.5rem; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); text-align: left; font-size: 0.9rem; color: #6c757d;">
+    <h3>Description</h3>
+    <ul>
+      <li>Everything is <strong>open-sourced.</strong> <a href="https://github.com/k2-fsa/sherpa-onnx">code</a></li>
+      <li>If you have any issues, please either <a href="https://github.com/k2-fsa/sherpa-onnx/issues">file a ticket</a> or contact us via</li>
+      <ul>
+        <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#wechat">WeChat group</a></li>
+        <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#qq">QQ group</a></li>
+        <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#bilibili-b">Bilibili</a></li>
+      </ul>
+    </ul>
+    <h3>About This Demo</h3>
+    <ul>
+      <li><strong>Private and Secure:</strong> All processing is done locally on your device (CPU) within your browser with a single thread. No server is involved, ensuring privacy and security. You can disconnect from the Internet once this page is loaded.</li>
+      <li><strong>Efficient Resource Usage:</strong> No GPU is required, leaving system resources available for webLLM analysis.</li>
+    </ul>
+    <h3>Latest Update</h3>
+    <ul>
+      <li>Update UI.</li>
+      <li>First working version.</li>
+    </ul>
+
+    <h3>Acknowledgement</h3>
+    <ul>
+      <li>We refer to <a href="https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm">https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm</a> for the UI part.</li>
+    </ul>
+  </div>

   <script src="sherpa-onnx-asr.js"></script>
   <script src="sherpa-onnx-vad.js"></script>
sherpa-onnx-wasm-main-vad-asr.js CHANGED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-vad-asr.wasm CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b81ca128ee22079c23d1396aa5c1caaa2f2d62b10b2f31c737a82d94690e0daa
-size 11454080
+oid sha256:efd6fa873b373bdde2dd96ddca696f0cdd47e838f0cf715311f8f141b5f250d8
+size 11460886
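
The .wasm binary is stored with Git LFS, so the diff above only swaps the pointer (new oid and size). To confirm that a downloaded artifact matches the pointer, a minimal Node.js sketch, with the filename and hash taken from the pointer above:

// Verify a Git LFS artifact against the sha256 recorded in its pointer file.
const crypto = require('crypto');
const fs = require('fs');

const expected =
    'efd6fa873b373bdde2dd96ddca696f0cdd47e838f0cf715311f8f141b5f250d8';
const digest = crypto.createHash('sha256')
                   .update(fs.readFileSync('sherpa-onnx-wasm-main-vad-asr.wasm'))
                   .digest('hex');
console.log(digest === expected ? 'hash matches the LFS pointer'
                                : 'hash mismatch');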