csukuangfj committed on
Commit
f49c03b
·
1 Parent(s): 0b64e02

update model

Browse files
app-tts.js CHANGED
@@ -1,5 +1,4 @@
1
  const generateBtn = document.getElementById('generateBtn');
2
- const hint = document.getElementById('hint');
3
  const speakerIdLabel = document.getElementById('speakerIdLabel');
4
  const speakerIdInput = document.getElementById('speakerId');
5
  const speedInput = document.getElementById('speed');
@@ -11,13 +10,41 @@ speedValue.innerHTML = speedInput.value;
11
 
12
  let index = 0;
13
 
14
-
15
  let tts = null;
16
 
17
  let audioCtx = null;
18
 
19
-
20
  Module = {};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  Module.onRuntimeInitialized = function() {
22
  console.log('Model files downloaded!');
23
 
@@ -27,17 +54,10 @@ Module.onRuntimeInitialized = function() {
27
  speakerIdLabel.innerHTML = `Speaker ID (0 - ${tts.numSpeakers - 1}):`;
28
  }
29
 
30
- hint.innerText =
31
- 'Initialized! Please enter text and click the Generate button.';
32
-
33
-
34
-
35
  generateBtn.disabled = false;
36
  };
37
 
38
- speedInput.oninput = function() {
39
- speedValue.innerHTML = this.value;
40
- };
41
 
42
  generateBtn.onclick = function() {
43
  let speakerId = speakerIdInput.value;
@@ -69,12 +89,12 @@ generateBtn.onclick = function() {
69
  console.log('text', text);
70
 
71
  let audio =
72
- tts.generate({text: text, sid: speakerId, speed: speedInput.value});
73
 
74
  console.log(audio.samples.length, audio.sampleRate);
75
 
76
  if (!audioCtx) {
77
- audioCtx = new AudioContext({sampleRate: tts.sampleRate});
78
  }
79
 
80
  const buffer = audioCtx.createBuffer(1, audio.samples.length, tts.sampleRate);
@@ -155,22 +175,22 @@ function toWav(floatSamples, sampleRate) {
155
 
156
  // http://soundfile.sapp.org/doc/WaveFormat/
157
  // F F I R
158
- view.setUint32(0, 0x46464952, true); // chunkID
159
- view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
160
  // E V A W
161
- view.setUint32(8, 0x45564157, true); // format
162
- //
163
  // t m f
164
- view.setUint32(12, 0x20746d66, true); // subchunk1ID
165
- view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
166
- view.setUint32(20, 1, true); // audioFormat, 1 for PCM
167
- view.setUint16(22, 1, true); // numChannels: 1 channel
168
- view.setUint32(24, sampleRate, true); // sampleRate
169
- view.setUint32(28, sampleRate * 2, true); // byteRate
170
- view.setUint16(32, 2, true); // blockAlign
171
- view.setUint16(34, 16, true); // bitsPerSample
172
- view.setUint32(36, 0x61746164, true); // Subchunk2ID
173
- view.setUint32(40, samples.length * 2, true); // subchunk2Size
174
 
175
  let offset = 44;
176
  for (let i = 0; i < samples.length; ++i) {
@@ -178,5 +198,5 @@ function toWav(floatSamples, sampleRate) {
178
  offset += 2;
179
  }
180
 
181
- return new Blob([view], {type: 'audio/wav'});
182
  }
 
1
  const generateBtn = document.getElementById('generateBtn');
 
2
  const speakerIdLabel = document.getElementById('speakerIdLabel');
3
  const speakerIdInput = document.getElementById('speakerId');
4
  const speedInput = document.getElementById('speed');
 
10
 
11
  let index = 0;
12
 
 
13
  let tts = null;
14
 
15
  let audioCtx = null;
16
 
 
17
  Module = {};
18
+
19
// https://emscripten.org/docs/api_reference/module.html#Module.locateFile
// Builds the location of a requested file by prefixing its name with the
// directory of the loading script. Each lookup is logged for debugging.
Module.locateFile = (path, scriptDirectory = '') => {
  console.log(`path: ${path}, scriptDirectory: ${scriptDirectory}`);
  const resolvedLocation = scriptDirectory + path;
  return resolvedLocation;
};
24
+
25
// Status hook installed on the Emscripten Module object
// (https://emscripten.org/docs/api_reference/module.html).
// NOTE(review): presumably the generated loader calls this with progress
// text while loading and with '' once nothing is left to report -- confirm
// against sherpa-onnx-wasm-main-tts.js.
//
// Behavior:
//   - logs every status it receives;
//   - shows the status text in the #status element, replacing the terse
//     'Running...' message with a user-friendly one;
//   - while a non-empty status is shown, every .tab-content element carries
//     the `loading` class (hidden by the page's CSS); an empty status hides
//     #status and removes the class again.
//
// @param {string} status  Progress message; '' means there is nothing to show.
Module.setStatus = function(status) {
  console.log(`status ${status}`);

  // Strict comparison: only the exact runtime string is rewritten.
  const message = status === 'Running...'
      ? 'Model downloaded. Initializing text to speech model...'
      : status;
  const isDone = message === '';

  // Guard against a page without a #status element instead of throwing.
  const statusElement = document.getElementById('status');
  if (statusElement) {
    statusElement.textContent = message;
    statusElement.style.display = isDone ? 'none' : 'block';
  }

  // One pass over the tab contents covers both the show and the hide case.
  document.querySelectorAll('.tab-content').forEach((tabContentElement) => {
    tabContentElement.classList.toggle('loading', !isDone);
  });
};
47
+
48
  Module.onRuntimeInitialized = function() {
49
  console.log('Model files downloaded!');
50
 
 
54
  speakerIdLabel.innerHTML = `Speaker ID (0 - ${tts.numSpeakers - 1}):`;
55
  }
56
 
 
 
 
 
 
57
  generateBtn.disabled = false;
58
  };
59
 
60
// Keep the speed read-out in sync with the slider whenever it is moved.
speedInput.oninput = function() {
  const currentSpeed = this.value;
  speedValue.innerHTML = currentSpeed;
};
 
 
61
 
62
  generateBtn.onclick = function() {
63
  let speakerId = speakerIdInput.value;
 
89
  console.log('text', text);
90
 
91
  let audio =
92
+ tts.generate({text : text, sid : speakerId, speed : speedInput.value});
93
 
94
  console.log(audio.samples.length, audio.sampleRate);
95
 
96
  if (!audioCtx) {
97
+ audioCtx = new AudioContext({sampleRate : tts.sampleRate});
98
  }
99
 
100
  const buffer = audioCtx.createBuffer(1, audio.samples.length, tts.sampleRate);
 
175
 
176
  // http://soundfile.sapp.org/doc/WaveFormat/
177
  // F F I R
178
+ view.setUint32(0, 0x46464952, true); // chunkID
179
+ view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
180
  // E V A W
181
+ view.setUint32(8, 0x45564157, true); // format
182
+ //
183
  // t m f
184
+ view.setUint32(12, 0x20746d66, true); // subchunk1ID
185
+ view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
186
+ view.setUint32(20, 1, true); // audioFormat, 1 for PCM
187
+ view.setUint16(22, 1, true); // numChannels: 1 channel
188
+ view.setUint32(24, sampleRate, true); // sampleRate
189
+ view.setUint32(28, sampleRate * 2, true); // byteRate
190
+ view.setUint16(32, 2, true); // blockAlign
191
+ view.setUint16(34, 16, true); // bitsPerSample
192
+ view.setUint32(36, 0x61746164, true); // Subchunk2ID
193
+ view.setUint32(40, samples.length * 2, true); // subchunk2Size
194
 
195
  let offset = 44;
196
  for (let i = 0; i < samples.length; ++i) {
 
198
  offset += 2;
199
  }
200
 
201
+ return new Blob([ view ], {type : 'audio/wav'});
202
  }
index.html CHANGED
@@ -11,34 +11,70 @@
11
  textarea {
12
  width:100%;
13
  }
 
 
 
14
  </style>
15
  </head>
16
 
17
- <body>
18
  <h1>
19
  Next-gen Kaldi + WebAssembly<br/>
20
  Text-to-speech Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
21
  </h1>
22
- <div>
23
- <span id="hint">Loading model ... ...</span>
24
- <br/>
25
- <br/>
26
- <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
27
- <input type="text" id="speakerId" name="speakerId" value="0" />
28
- <br/>
29
- <br/>
30
- <label for="speed" id="speedLabel">Speed: </label>
31
- <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
32
- <span id="speedValue"></span>
33
- <br/>
34
- <br/>
35
- <textarea id="text" rows="10" placeholder="Please enter your text here and click the Generate button"></textarea>
36
- <br/>
37
- <br/>
38
- <button id="generateBtn" disabled>Generate</button>
 
 
 
 
 
39
  </div>
40
- <section flex="1" overflow="auto" id="sound-clips">
41
- </section>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  <script src="app-tts.js"></script>
44
  <script src="sherpa-onnx-tts.js"></script>
 
11
  textarea {
12
  width:100%;
13
  }
14
+ .loading {
15
+ display: none !important;
16
+ }
17
  </style>
18
  </head>
19
 
20
+ <body style="font-family: 'Source Sans Pro', sans-serif; background-color: #f9fafb; color: #333; display: flex; flex-direction: column; align-items: center; height: 100vh; margin: 0;">
21
  <h1>
22
  Next-gen Kaldi + WebAssembly<br/>
23
  Text-to-speech Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
24
  </h1>
25
+
26
+ <div style="width: 100%; max-width: 900px; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); flex: 1;">
27
+ <div id="status">Loading...</div>
28
+
29
+ <div id="singleAudioContent" class="tab-content loading">
30
+ <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
31
+ <input type="text" id="speakerId" name="speakerId" value="0" />
32
+ <br/>
33
+ <br/>
34
+ <label for="speed" id="speedLabel">Speed: </label>
35
+ <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
36
+ <span id="speedValue"></span>
37
+ <br/>
38
+ <br/>
39
+ <textarea id="text" rows="10" placeholder="Please enter your text here and click the Generate button"></textarea>
40
+ <br/>
41
+ <br/>
42
+ <button id="generateBtn" disabled>Generate</button>
43
+ </div>
44
+
45
+ <section flex="1" overflow="auto" id="sound-clips">
46
+ </section>
47
  </div>
48
+
49
+ <!-- Footer Section -->
50
+ <div style="width: 100%; max-width: 900px; margin-top: 1.5rem; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); text-align: left; font-size: 0.9rem; color: #6c757d;">
51
+ <h3>Description</h3>
52
+ <ul>
53
+ <li>Everything is <strong>open-sourced.</strong> <a href="https://github.com/k2-fsa/sherpa-onnx">code</a></li>
54
+ <li>If you have any issues, please either <a href="https://github.com/k2-fsa/sherpa-onnx/issues">file a ticket</a> or contact us via</li>
55
+ <ul>
56
+ <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#wechat">WeChat group</a></li>
57
+ <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#qq">QQ group</a></li>
58
+ <li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#bilibili-b">Bilibili</a></li>
59
+ </ul>
60
+ </ul>
61
+ <h3>About This Demo</h3>
62
+ <ul>
63
+ <li><strong>Private and Secure:</strong> All processing is done locally on your device (CPU) within your browser with a single thread. No server is involved, ensuring privacy and security. You can disconnect from the Internet once this page is loaded.</li>
64
+ <li><strong>Efficient Resource Usage:</strong> No GPU is required, leaving system resources available for webLLM analysis.</li>
65
+ </ul>
66
+ <h3>Latest Update</h3>
67
+ <ul>
68
+ <li>Update UI.</li>
69
+ <li>First working version.</li>
70
+ </ul>
71
+
72
+ <h3>Acknowledgement</h3>
73
+ <ul>
74
+ <li>We refer to <a href="https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm">https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm</a> for the UI part.</li>
75
+ </ul>
76
+ </div>
77
+
78
 
79
  <script src="app-tts.js"></script>
80
  <script src="sherpa-onnx-tts.js"></script>
sherpa-onnx-tts.js CHANGED
@@ -263,7 +263,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
263
 
264
  const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
265
  const buffer = Module._malloc(providerLen);
266
- Module.stringToUTF8(config.provider, buffer, providerLen);
267
  Module.setValue(ptr + offset, buffer, 'i8*');
268
  offset += 4;
269
 
 
263
 
264
  const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
265
  const buffer = Module._malloc(providerLen);
266
+ Module.stringToUTF8(config.provider || 'cpu', buffer, providerLen);
267
  Module.setValue(ptr + offset, buffer, 'i8*');
268
  offset += 4;
269
 
sherpa-onnx-wasm-main-tts.data CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b33e49a908d597928504c72ce5b5f6a7092ce9a9586224ccae83da01479b8492
3
- size 96574820
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a3d2f2bafcf35ce10c89565e92653c2062bc086a9bf5656df9173ce57a4098d
3
+ size 96574824
sherpa-onnx-wasm-main-tts.js CHANGED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-tts.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87db632ac2778db8c2e1a090870ed96d4218dc28a4d1098f34c56d9f61951299
3
- size 11726711
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f8917e1ea904d02da511c777dcb9deee981222b0a14c71ad92fb0a1ef4452ae
3
+ size 11726942