ciyidogan committed on
Commit
f99d306
·
verified ·
1 Parent(s): 165e2d0

Create voice_activity_detector.py

Browse files
Files changed (1) hide show
  1. stt/voice_activity_detector.py +159 -0
stt/voice_activity_detector.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Activity Detector for Flare
3
+ ==================================
4
+ Detects speech/silence in audio streams
5
+ """
6
import struct
from datetime import datetime, timezone
from typing import Optional, Tuple

from utils.logger import log_debug, log_warning
10
+
11
+
12
class VoiceActivityDetector:
    """Energy-based detector for speech and silence in a PCM audio stream.

    Each chunk passed to :meth:`process_chunk` is classified as speech when
    its RMS energy exceeds ``energy_threshold``. A speech segment is
    considered finished once silence has lasted at least
    ``silence_threshold_ms`` milliseconds.

    Timing is based on the wall clock at the moment each chunk is processed,
    not on the number of samples in the chunk.
    """

    def __init__(self,
                 energy_threshold: float = 500,
                 silence_threshold_ms: int = 2000,
                 sample_rate: int = 16000):
        """
        Initialize VAD

        Args:
            energy_threshold: RMS energy threshold for speech detection
            silence_threshold_ms: Milliseconds of silence before considering speech ended
            sample_rate: Audio sample rate (stored only; none of the
                calculations in this class read it)
        """
        self.energy_threshold = energy_threshold
        self.silence_threshold_ms = silence_threshold_ms
        self.sample_rate = sample_rate

        # State tracking (timestamps are naive datetimes expressed in UTC)
        self.is_speaking = False
        self.silence_start: Optional[datetime] = None
        self.speech_start: Optional[datetime] = None
        self.last_speech_time: Optional[datetime] = None

        # Statistics (cumulative; reset() does not clear them)
        self.total_speech_chunks = 0
        self.total_silence_chunks = 0

    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive datetime.

        Produces the same kind of value as the deprecated
        ``datetime.utcnow()`` without triggering its DeprecationWarning on
        Python 3.12+, so the timestamp attributes stay naive.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def process_chunk(self, audio_chunk: bytes) -> Tuple[bool, int]:
        """
        Process audio chunk and detect speech/silence

        Args:
            audio_chunk: Raw PCM audio data (LINEAR16)

        Returns:
            Tuple of (is_speech, silence_duration_ms). The duration is
            non-zero only while a speech segment is open and the current
            chunk is silent; in every other case it is 0.
        """
        try:
            # Calculate RMS energy
            rms_energy = self._calculate_rms_energy(audio_chunk)
            is_speech = rms_energy > self.energy_threshold

            now = self._utc_now()

            if is_speech:
                # Speech detected
                if not self.is_speaking:
                    # Speech just started
                    self.is_speaking = True
                    self.speech_start = now
                    log_debug(f"🎤 Speech started (RMS: {rms_energy})")

                self.last_speech_time = now
                # Any pending silence window is cancelled by new speech.
                self.silence_start = None
                self.total_speech_chunks += 1

                return True, 0

            # Silence detected
            self.total_silence_chunks += 1

            if not self.is_speaking:
                # Already in silence
                return False, 0

            # Was speaking, now silent
            if self.silence_start is None:
                self.silence_start = now
                log_debug(f"🔇 Silence started (RMS: {rms_energy})")

            # Calculate silence duration
            silence_duration = (now - self.silence_start).total_seconds() * 1000

            if silence_duration >= self.silence_threshold_ms:
                # Speech has ended; silence_start is kept so that
                # get_silence_duration() keeps measuring from this point.
                self.is_speaking = False
                log_debug(f"💬 Speech ended after {silence_duration:.0f}ms of silence")

            return False, int(silence_duration)

        except Exception as e:
            log_warning(f"VAD processing error: {e}")
            # On error, assume speech to avoid cutting off
            return True, 0

    def _calculate_rms_energy(self, audio_chunk: bytes) -> float:
        """Calculate RMS energy of audio chunk.

        The chunk is interpreted as signed 16-bit little-endian samples
        (LINEAR16). A trailing odd byte is ignored. Returns 0.0 for empty or
        too-short input and when the chunk cannot be decoded.
        """
        try:
            # Handle empty or invalid chunk
            if not audio_chunk or len(audio_chunk) < 2:
                return 0.0

            # Ensure even number of bytes for 16-bit audio
            if len(audio_chunk) % 2 != 0:
                audio_chunk = audio_chunk[:-1]

            # Convert bytes to int16 samples. The explicit "<" pins the byte
            # order to little-endian instead of the host's native order.
            num_samples = len(audio_chunk) // 2
            samples = struct.unpack(f'<{num_samples}h', audio_chunk)

            if not samples:
                return 0.0

            # Calculate RMS
            sum_squares = sum(s * s for s in samples)
            rms = (sum_squares / len(samples)) ** 0.5

            return rms

        except Exception as e:
            log_warning(f"RMS calculation error: {e}")
            return 0.0

    def reset(self):
        """Reset VAD state (the chunk counters are left untouched)"""
        self.is_speaking = False
        self.silence_start = None
        self.speech_start = None
        self.last_speech_time = None
        log_debug("🔄 VAD state reset")

    def get_speech_duration(self) -> float:
        """Get current speech duration in seconds (0.0 when not speaking)"""
        if self.speech_start and self.is_speaking:
            return (self._utc_now() - self.speech_start).total_seconds()
        return 0.0

    def get_silence_duration(self) -> float:
        """Get current silence duration in seconds.

        Returns 0.0 while a speech segment is still open, even if a silence
        window has already started inside it.
        """
        if self.silence_start and not self.is_speaking:
            return (self._utc_now() - self.silence_start).total_seconds()
        return 0.0

    def get_stats(self) -> dict:
        """Get VAD statistics"""
        return {
            "is_speaking": self.is_speaking,
            "speech_chunks": self.total_speech_chunks,
            "silence_chunks": self.total_silence_chunks,
            "speech_duration": self.get_speech_duration(),
            "silence_duration": self.get_silence_duration(),
            "energy_threshold": self.energy_threshold,
            "silence_threshold_ms": self.silence_threshold_ms
        }