Spaces:
Running
Running
Delete VoiceActivityDetector.js
Browse files- VoiceActivityDetector.js +0 -109
VoiceActivityDetector.js
DELETED
@@ -1,109 +0,0 @@
|
|
1 |
-
import OnnxWrapper from './Silero.js';
|
2 |
-
|
3 |
-
/** Path of the ONNX model file that VadDetector hands to OnnxWrapper. Make sure this path is correct. */
const modelPath = 'silero_vad.onnx';
|
4 |
-
|
5 |
-
/**
 * Streaming voice-activity detector built on top of an ONNX model wrapper.
 *
 * Audio is fed frame by frame through {@link VadDetector#apply}. The detector
 * keeps a running sample counter plus a small hysteresis state machine and
 * reports `{ start }` when speech begins and `{ end }` once the speech
 * probability has stayed below the end threshold for long enough.
 */
export class VadDetector {
  /**
   * @param {number} startThreshold - Speech probability at or above which a frame counts as speech.
   * @param {number} endThreshold - Speech probability below which a frame counts as a potential speech end.
   * @param {number} samplingRate - Sampling rate of the audio; must be 8000 or 16000.
   * @param {number} minSilenceDurationMs - Minimum silence, in milliseconds, before a speech end is confirmed.
   * @param {number} speechPadMs - Padding, in milliseconds, subtracted from reported starts and added to reported ends.
   * @throws {Error} If `samplingRate` is neither 8000 nor 16000.
   */
  constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }

    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    // Durations are converted from milliseconds to sample counts once, up front.
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  /**
   * Resets the model state and the detector's own bookkeeping
   * (trigger flag, pending end marker, running sample counter).
   */
  reset() {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  /**
   * Feeds one frame of audio to the model and advances the state machine.
   *
   * The frame handed to the model is always 512 samples (16 kHz) or 256
   * samples (8 kHz): shorter input is zero-padded, longer input is truncated.
   * The running sample counter always advances by the length of the input as
   * supplied by the caller.
   *
   * @param {ArrayLike<number> & Iterable<number>} data - Audio samples for this frame.
   * @param {boolean} returnSeconds - When truthy, positions are reported in seconds
   *   (rounded to one decimal) instead of sample indices.
   * @returns {Promise<{start?: number, end?: number}>} `{ start }` when speech begins,
   *   `{ end }` when a speech end is confirmed, otherwise an empty object.
   * @throws {Error} If the model call fails or returns an unexpected shape; the
   *   underlying error is available as `cause`.
   */
  async apply(data, returnSeconds) {
    console.log(`Applying VAD to data of length ${data.length}`);
    this.currentSample += data.length;

    const rowLength = this.samplingRate === 16000 ? 512 : 256;

    // Build the fixed-size frame in a local instead of reassigning the parameter.
    let frame;
    if (data.length < rowLength) {
      console.warn(`Input data length (${data.length}) is less than required (${rowLength}). Padding with zeros.`);
      frame = [...data, ...new Array(rowLength - data.length).fill(0)];
    } else if (data.length > rowLength) {
      console.warn(`Input data length (${data.length}) is greater than required (${rowLength}). Truncating.`);
      frame = Array.from(data.slice(0, rowLength));
    } else {
      frame = Array.from(data);
    }
    const x = [frame];

    let speechProb;
    try {
      console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
      const result = await this.model.call(x, this.samplingRate);
      if (!Array.isArray(result) || !result[0] || result[0][0] === undefined) {
        throw new Error("Unexpected response from model");
      }
      speechProb = result[0][0];
      console.log(`Speech probability: ${speechProb}`);
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      // Same message as before, but keep the original error (and its stack) reachable.
      throw new Error(`Error calling the model: ${e}`, { cause: e });
    }

    // Converts a sample position into the unit the caller asked for.
    const toPosition = (sample) =>
      (returnSeconds ? Number((sample / this.samplingRate).toFixed(1)) : sample);

    const isSpeech = speechProb >= this.startThreshold;

    // Speech resumed before the silence was long enough: drop the pending end marker.
    if (isSpeech && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (isSpeech && !this.triggered) {
      this.triggered = true;
      // currentSample already includes this frame, so the start is measured
      // from the end of the frame in which speech was first detected.
      const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      return { start: toPosition(speechStart) };
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }

      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      }

      const speechEnd = this.tempEnd + this.speechPadSamples;
      console.log(`Speech end confirmed at sample ${speechEnd}`);
      this.tempEnd = 0;
      this.triggered = false;
      return { end: toPosition(speechEnd) };
    }

    return {};
  }

  /**
   * Resets the detector and closes the underlying model.
   * The model is closed even if the reset step throws.
   */
  async close() {
    try {
      this.reset();
    } finally {
      await this.model.close();
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|