Commit
·
01f43f2
1
Parent(s):
e80bce6
update model
Browse files
- app-vad-asr.js +47 -44
- sherpa-onnx-asr.js +37 -2
- sherpa-onnx-wasm-main-vad-asr.js +0 -0
- sherpa-onnx-wasm-main-vad-asr.wasm +2 -2
app-vad-asr.js
CHANGED
@@ -15,7 +15,7 @@ let resultList = [];
|
|
15 |
clearBtn.onclick = function() {
|
16 |
resultList = [];
|
17 |
textArea.value = getDisplayResult();
|
18 |
-
textArea.scrollTop = textArea.scrollHeight;
|
19 |
};
|
20 |
|
21 |
function getDisplayResult() {
|
@@ -46,11 +46,11 @@ let audioCtx;
|
|
46 |
let mediaStream;
|
47 |
|
48 |
let expectedSampleRate = 16000;
|
49 |
-
let recordSampleRate;
|
50 |
-
let recorder = null;
|
51 |
-
let leftchannel = [];
|
52 |
|
53 |
-
let recordingLength = 0;
|
54 |
|
55 |
let vad = null;
|
56 |
let buffer = null;
|
@@ -73,48 +73,50 @@ function createOfflineRecognizerSenseVoice() {}
|
|
73 |
|
74 |
function initOfflineRecognizer() {
|
75 |
let config = {
|
76 |
-
modelConfig
|
77 |
-
debug
|
78 |
-
tokens
|
79 |
},
|
80 |
};
|
81 |
if (fileExists('sense-voice.onnx') == 1) {
|
82 |
config.modelConfig.senseVoice = {
|
83 |
-
model
|
84 |
-
useInverseTextNormalization
|
85 |
};
|
86 |
} else if (fileExists('whisper-encoder.onnx')) {
|
87 |
config.modelConfig.whisper = {
|
88 |
-
encoder
|
89 |
-
decoder
|
90 |
};
|
91 |
} else if (fileExists('transducer-encoder.onnx')) {
|
92 |
config.modelConfig.transducer = {
|
93 |
-
encoder
|
94 |
-
decoder
|
95 |
-
joiner
|
96 |
};
|
97 |
config.modelConfig.modelType = 'transducer';
|
98 |
} else if (fileExists('nemo-transducer-encoder.onnx')) {
|
99 |
config.modelConfig.transducer = {
|
100 |
-
encoder
|
101 |
-
decoder
|
102 |
-
joiner
|
103 |
};
|
104 |
config.modelConfig.modelType = 'nemo_transducer';
|
105 |
} else if (fileExists('paraformer.onnx')) {
|
106 |
config.modelConfig.paraformer = {
|
107 |
-
model
|
108 |
};
|
109 |
} else if (fileExists('telespeech.onnx')) {
|
110 |
config.modelConfig.telespeechCtc = './telespeech.onnx';
|
111 |
} else if (fileExists('moonshine-preprocessor.onnx')) {
|
112 |
config.modelConfig.moonshine = {
|
113 |
-
preprocessor
|
114 |
-
encoder
|
115 |
-
uncachedDecoder
|
116 |
-
cachedDecoder
|
117 |
};
|
|
|
|
|
118 |
} else {
|
119 |
console.log('Please specify a model.');
|
120 |
alert('Please specify a model.');
|
@@ -133,7 +135,7 @@ Module.locateFile = function(path, scriptDirectory = '') {
|
|
133 |
Module.setStatus = function(status) {
|
134 |
console.log(`status ${status}`);
|
135 |
const statusElement = document.getElementById('status');
|
136 |
-
if (status ==
|
137 |
status = 'Model downloaded. Initializing recongizer...'
|
138 |
}
|
139 |
statusElement.textContent = status;
|
@@ -170,11 +172,11 @@ if (navigator.mediaDevices.getUserMedia) {
|
|
170 |
console.log('getUserMedia supported.');
|
171 |
|
172 |
// see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
|
173 |
-
const constraints = {audio
|
174 |
|
175 |
let onSuccess = function(stream) {
|
176 |
if (!audioCtx) {
|
177 |
-
audioCtx = new AudioContext({sampleRate
|
178 |
}
|
179 |
console.log(audioCtx);
|
180 |
recordSampleRate = audioCtx.sampleRate;
|
@@ -299,7 +301,7 @@ if (navigator.mediaDevices.getUserMedia) {
|
|
299 |
}
|
300 |
|
301 |
textArea.value = getDisplayResult();
|
302 |
-
textArea.scrollTop = textArea.scrollHeight;
|
303 |
};
|
304 |
|
305 |
startBtn.onclick = function() {
|
@@ -330,8 +332,9 @@ if (navigator.mediaDevices.getUserMedia) {
|
|
330 |
};
|
331 |
};
|
332 |
|
333 |
-
let onError = function(
|
334 |
-
|
|
|
335 |
|
336 |
navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
|
337 |
} else {
|
@@ -364,22 +367,22 @@ function toWav(samples) {
|
|
364 |
|
365 |
// http://soundfile.sapp.org/doc/WaveFormat/
|
366 |
// F F I R
|
367 |
-
view.setUint32(0, 0x46464952, true);
|
368 |
-
view.setUint32(4, 36 + samples.length * 2, true);
|
369 |
// E V A W
|
370 |
-
view.setUint32(8, 0x45564157, true);
|
371 |
-
|
372 |
// t m f
|
373 |
-
view.setUint32(12, 0x20746d66, true);
|
374 |
-
view.setUint32(16, 16, true);
|
375 |
-
view.setUint32(20, 1, true);
|
376 |
-
view.setUint16(22, 1, true);
|
377 |
-
view.setUint32(24, expectedSampleRate, true);
|
378 |
-
view.setUint32(28, expectedSampleRate * 2, true);
|
379 |
-
view.setUint16(32, 2, true);
|
380 |
-
view.setUint16(34, 16, true);
|
381 |
-
view.setUint32(36, 0x61746164, true);
|
382 |
-
view.setUint32(40, samples.length * 2, true);
|
383 |
|
384 |
let offset = 44;
|
385 |
for (let i = 0; i < samples.length; ++i) {
|
@@ -387,7 +390,7 @@ function toWav(samples) {
|
|
387 |
offset += 2;
|
388 |
}
|
389 |
|
390 |
-
return new Blob([
|
391 |
}
|
392 |
|
393 |
// this function is copied from
|
|
|
15 |
clearBtn.onclick = function() {
|
16 |
resultList = [];
|
17 |
textArea.value = getDisplayResult();
|
18 |
+
textArea.scrollTop = textArea.scrollHeight; // auto scroll
|
19 |
};
|
20 |
|
21 |
function getDisplayResult() {
|
|
|
46 |
let mediaStream;
|
47 |
|
48 |
let expectedSampleRate = 16000;
|
49 |
+
let recordSampleRate; // the sampleRate of the microphone
|
50 |
+
let recorder = null; // the microphone
|
51 |
+
let leftchannel = []; // TODO: Use a single channel
|
52 |
|
53 |
+
let recordingLength = 0; // number of samples so far
|
54 |
|
55 |
let vad = null;
|
56 |
let buffer = null;
|
|
|
73 |
|
74 |
function initOfflineRecognizer() {
|
75 |
let config = {
|
76 |
+
modelConfig: {
|
77 |
+
debug: 1,
|
78 |
+
tokens: './tokens.txt',
|
79 |
},
|
80 |
};
|
81 |
if (fileExists('sense-voice.onnx') == 1) {
|
82 |
config.modelConfig.senseVoice = {
|
83 |
+
model: './sense-voice.onnx',
|
84 |
+
useInverseTextNormalization: 1,
|
85 |
};
|
86 |
} else if (fileExists('whisper-encoder.onnx')) {
|
87 |
config.modelConfig.whisper = {
|
88 |
+
encoder: './whisper-encoder.onnx',
|
89 |
+
decoder: './whisper-decoder.onnx',
|
90 |
};
|
91 |
} else if (fileExists('transducer-encoder.onnx')) {
|
92 |
config.modelConfig.transducer = {
|
93 |
+
encoder: './transducer-encoder.onnx',
|
94 |
+
decoder: './transducer-decoder.onnx',
|
95 |
+
joiner: './transducer-joiner.onnx',
|
96 |
};
|
97 |
config.modelConfig.modelType = 'transducer';
|
98 |
} else if (fileExists('nemo-transducer-encoder.onnx')) {
|
99 |
config.modelConfig.transducer = {
|
100 |
+
encoder: './nemo-transducer-encoder.onnx',
|
101 |
+
decoder: './nemo-transducer-decoder.onnx',
|
102 |
+
joiner: './nemo-transducer-joiner.onnx',
|
103 |
};
|
104 |
config.modelConfig.modelType = 'nemo_transducer';
|
105 |
} else if (fileExists('paraformer.onnx')) {
|
106 |
config.modelConfig.paraformer = {
|
107 |
+
model: './paraformer.onnx',
|
108 |
};
|
109 |
} else if (fileExists('telespeech.onnx')) {
|
110 |
config.modelConfig.telespeechCtc = './telespeech.onnx';
|
111 |
} else if (fileExists('moonshine-preprocessor.onnx')) {
|
112 |
config.modelConfig.moonshine = {
|
113 |
+
preprocessor: './moonshine-preprocessor.onnx',
|
114 |
+
encoder: './moonshine-encoder.onnx',
|
115 |
+
uncachedDecoder: './moonshine-uncached-decoder.onnx',
|
116 |
+
cachedDecoder: './moonshine-cached-decoder.onnx'
|
117 |
};
|
118 |
+
} else if (fileExists('dolphin.onnx')) {
|
119 |
+
config.modelConfig.dolphin = {model: './dolphin.onnx'};
|
120 |
} else {
|
121 |
console.log('Please specify a model.');
|
122 |
alert('Please specify a model.');
|
|
|
135 |
Module.setStatus = function(status) {
|
136 |
console.log(`status ${status}`);
|
137 |
const statusElement = document.getElementById('status');
|
138 |
+
if (status == 'Running...') {
|
139 |
status = 'Model downloaded. Initializing recongizer...'
|
140 |
}
|
141 |
statusElement.textContent = status;
|
|
|
172 |
console.log('getUserMedia supported.');
|
173 |
|
174 |
// see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
|
175 |
+
const constraints = {audio: true};
|
176 |
|
177 |
let onSuccess = function(stream) {
|
178 |
if (!audioCtx) {
|
179 |
+
audioCtx = new AudioContext({sampleRate: expectedSampleRate});
|
180 |
}
|
181 |
console.log(audioCtx);
|
182 |
recordSampleRate = audioCtx.sampleRate;
|
|
|
301 |
}
|
302 |
|
303 |
textArea.value = getDisplayResult();
|
304 |
+
textArea.scrollTop = textArea.scrollHeight; // auto scroll
|
305 |
};
|
306 |
|
307 |
startBtn.onclick = function() {
|
|
|
332 |
};
|
333 |
};
|
334 |
|
335 |
+
// Rejection handler passed to getUserMedia(...).then(onSuccess, onError):
// reports why the microphone stream could not be obtained. The error is
// written to the console's error channel so it stands out from the regular
// status logging.
let onError = function(err) {
  console.error('The following error occurred: ' + err);
};
|
338 |
|
339 |
navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
|
340 |
} else {
|
|
|
367 |
|
368 |
// http://soundfile.sapp.org/doc/WaveFormat/
|
369 |
// F F I R
|
370 |
+
view.setUint32(0, 0x46464952, true); // chunkID
|
371 |
+
view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
|
372 |
// E V A W
|
373 |
+
view.setUint32(8, 0x45564157, true); // format
|
374 |
+
//
|
375 |
// t m f
|
376 |
+
view.setUint32(12, 0x20746d66, true); // subchunk1ID
|
377 |
+
view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
|
378 |
+
view.setUint32(20, 1, true); // audioFormat, 1 for PCM
|
379 |
+
view.setUint16(22, 1, true); // numChannels: 1 channel
|
380 |
+
view.setUint32(24, expectedSampleRate, true); // sampleRate
|
381 |
+
view.setUint32(28, expectedSampleRate * 2, true); // byteRate
|
382 |
+
view.setUint16(32, 2, true); // blockAlign
|
383 |
+
view.setUint16(34, 16, true); // bitsPerSample
|
384 |
+
view.setUint32(36, 0x61746164, true); // Subchunk2ID
|
385 |
+
view.setUint32(40, samples.length * 2, true); // subchunk2Size
|
386 |
|
387 |
let offset = 44;
|
388 |
for (let i = 0; i < samples.length; ++i) {
|
|
|
390 |
offset += 2;
|
391 |
}
|
392 |
|
393 |
+
return new Blob([view], {type: 'audio/wav'});
|
394 |
}
|
395 |
|
396 |
// this function is copied from
|
sherpa-onnx-asr.js
CHANGED
@@ -39,6 +39,10 @@ function freeConfig(config, Module) {
|
|
39 |
freeConfig(config.fireRedAsr, Module)
|
40 |
}
|
41 |
|
|
|
|
|
|
|
|
|
42 |
if ('moonshine' in config) {
|
43 |
freeConfig(config.moonshine, Module)
|
44 |
}
|
@@ -562,6 +566,23 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) {
|
|
562 |
}
|
563 |
}
|
564 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
|
566 |
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
|
567 |
const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
|
@@ -769,6 +790,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
|
769 |
};
|
770 |
}
|
771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
772 |
if (!('whisper' in config)) {
|
773 |
config.whisper = {
|
774 |
encoder: '',
|
@@ -832,8 +859,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
|
832 |
const fireRedAsr =
|
833 |
initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);
|
834 |
|
|
|
|
|
|
|
835 |
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
|
836 |
-
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len
|
|
|
837 |
|
838 |
const ptr = Module._malloc(len);
|
839 |
|
@@ -932,10 +963,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
|
932 |
Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
|
933 |
offset += fireRedAsr.len;
|
934 |
|
|
|
|
|
|
|
935 |
return {
|
936 |
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
|
937 |
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
|
938 |
-
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr
|
|
|
939 |
}
|
940 |
}
|
941 |
|
|
|
39 |
freeConfig(config.fireRedAsr, Module)
|
40 |
}
|
41 |
|
42 |
+
if ('dolphin' in config) {
|
43 |
+
freeConfig(config.dolphin, Module)
|
44 |
+
}
|
45 |
+
|
46 |
if ('moonshine' in config) {
|
47 |
freeConfig(config.moonshine, Module)
|
48 |
}
|
|
|
566 |
}
|
567 |
}
|
568 |
|
569 |
+
/**
 * Marshal the Dolphin offline-model settings into memory obtained from the
 * wasm `Module`.
 *
 * Two allocations are made with `Module._malloc`, in this order:
 *   1. `buffer` - the model path encoded as a NUL-terminated UTF-8 string;
 *   2. `ptr`    - a `len`-byte block holding a single pointer to `buffer`.
 *
 * NOTE(review): nothing is freed here; presumably the caller releases both
 * allocations later (e.g. through freeConfig) - confirm against the callers.
 *
 * @param {{model?: string}} config Dolphin settings. A missing config, or a
 *     missing/empty `model`, is encoded as the empty string.
 * @param {Object} Module Object providing `lengthBytesUTF8`, `_malloc`,
 *     `stringToUTF8` and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}} The two allocated
 *     addresses and the byte size of the block at `ptr`.
 */
function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
  // Resolve the path once so the measured length and the copied string can
  // never disagree.
  const model = (config && config.model) || '';

  // +1 reserves room for the terminating NUL byte.
  const n = Module.lengthBytesUTF8(model) + 1;
  const buffer = Module._malloc(n);

  const len = 1 * 4;  // 1 pointer
  const ptr = Module._malloc(len);

  Module.stringToUTF8(model, buffer, n);

  // The block at `ptr` consists of exactly one field: the address of the
  // model-path string.
  Module.setValue(ptr, buffer, 'i8*');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
|
585 |
+
|
586 |
function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
|
587 |
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
|
588 |
const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
|
|
|
790 |
};
|
791 |
}
|
792 |
|
793 |
+
if (!('dolphin' in config)) {
|
794 |
+
config.dolphin = {
|
795 |
+
model: '',
|
796 |
+
};
|
797 |
+
}
|
798 |
+
|
799 |
if (!('whisper' in config)) {
|
800 |
config.whisper = {
|
801 |
encoder: '',
|
|
|
859 |
const fireRedAsr =
|
860 |
initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);
|
861 |
|
862 |
+
const dolphin =
|
863 |
+
initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
|
864 |
+
|
865 |
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
|
866 |
+
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
|
867 |
+
dolphin.len;
|
868 |
|
869 |
const ptr = Module._malloc(len);
|
870 |
|
|
|
963 |
Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
|
964 |
offset += fireRedAsr.len;
|
965 |
|
966 |
+
Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
|
967 |
+
offset += dolphin.len;
|
968 |
+
|
969 |
return {
|
970 |
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
|
971 |
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
|
972 |
+
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
|
973 |
+
dolphin: dolphin
|
974 |
}
|
975 |
}
|
976 |
|
sherpa-onnx-wasm-main-vad-asr.js
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
sherpa-onnx-wasm-main-vad-asr.wasm
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7de0a408867bb4ee9ba03c63fb0a5b50f30facf9d962a27a651fb41b1c83cbfe
|
3 |
+
size 11465949
|