Spaces:

UcsTurkey
/

flare

Building

App Files Files Community

ciyidogan committed on 6 days ago

Commit

f99d306

verified ·

1 Parent(s): 165e2d0

Create voice_activity_detector.py

Browse files

Files changed (1) hide show

stt/voice_activity_detector.py +159 -0

stt/voice_activity_detector.py ADDED Viewed

	@@ -0,0 +1,159 @@

"""
Voice Activity Detector for Flare
==================================
Detects speech and silence in audio streams.
"""
import struct
from datetime import datetime, timezone
from typing import Optional, Tuple

from utils.logger import log_debug, log_warning
class VoiceActivityDetector:
    """Energy-based detector of speech and silence in a PCM audio stream.

    Each chunk handed to :meth:`process_chunk` is classified as speech when
    its RMS energy is strictly greater than ``energy_threshold``. Once speech
    has started, it is considered ended after ``silence_threshold_ms``
    milliseconds of uninterrupted silence, measured with wall-clock time
    between calls (not by counting audio samples).
    """

    def __init__(self,
                 energy_threshold: float = 500,
                 silence_threshold_ms: int = 2000,
                 sample_rate: int = 16000):
        """
        Initialize VAD

        Args:
            energy_threshold: RMS energy threshold for speech detection
            silence_threshold_ms: Milliseconds of silence before considering
                speech ended
            sample_rate: Audio sample rate. Stored on the instance; no method
                of this class reads it.
        """
        self.energy_threshold = energy_threshold
        self.silence_threshold_ms = silence_threshold_ms
        self.sample_rate = sample_rate

        # State tracking. Timestamps are naive datetimes expressed in UTC.
        self.is_speaking = False
        self.silence_start: Optional[datetime] = None
        self.speech_start: Optional[datetime] = None
        self.last_speech_time: Optional[datetime] = None

        # Statistics (cumulative; reset() does not clear them)
        self.total_speech_chunks = 0
        self.total_silence_chunks = 0

    @staticmethod
    def _utcnow() -> datetime:
        """Return the current UTC time as a naive datetime.

        Drop-in replacement for ``datetime.utcnow()``, which is deprecated
        since Python 3.12. The tzinfo is stripped so the stored timestamps
        keep the naive type they have always had.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def process_chunk(self, audio_chunk: bytes) -> Tuple[bool, int]:
        """
        Process audio chunk and detect speech/silence

        Args:
            audio_chunk: Raw PCM audio data (LINEAR16)

        Returns:
            Tuple of (is_speech, silence_duration_ms). The duration is 0 for
            speech chunks and for silence chunks received while no speech is
            in progress; otherwise it is the time elapsed since the first
            silent chunk that followed speech.
        """
        try:
            rms_energy = self._calculate_rms_energy(audio_chunk)
            now = self._utcnow()

            if rms_energy > self.energy_threshold:
                # Speech detected
                if not self.is_speaking:
                    # Speech just started
                    self.is_speaking = True
                    self.speech_start = now
                    log_debug(f"🎤 Speech started (RMS: {rms_energy})")
                self.last_speech_time = now
                self.silence_start = None  # any pending silence is cancelled
                self.total_speech_chunks += 1
                return True, 0

            # Silence detected
            self.total_silence_chunks += 1
            if not self.is_speaking:
                # Already in silence
                return False, 0

            # Was speaking, now silent
            if self.silence_start is None:
                self.silence_start = now
                log_debug(f"🔇 Silence started (RMS: {rms_energy})")

            silence_duration = (now - self.silence_start).total_seconds() * 1000
            if silence_duration >= self.silence_threshold_ms:
                # Speech has ended. silence_start is kept so that
                # get_silence_duration() can keep reporting from it.
                self.is_speaking = False
                log_debug(f"💬 Speech ended after {silence_duration:.0f}ms of silence")
            return False, int(silence_duration)
        except Exception as e:
            log_warning(f"VAD processing error: {e}")
            # On error, assume speech to avoid cutting off
            return True, 0

    def _calculate_rms_energy(self, audio_chunk: bytes) -> float:
        """Calculate RMS energy of audio chunk.

        The chunk is decoded as signed 16-bit little-endian samples (the
        LINEAR16 layout), independent of the host byte order. A trailing odd
        byte is ignored. Returns 0.0 for an empty chunk, a chunk shorter than
        one sample, or any chunk that cannot be decoded.
        """
        try:
            # Handle empty or invalid chunk
            if not audio_chunk or len(audio_chunk) < 2:
                return 0.0

            num_samples = len(audio_chunk) // 2
            # unpack_from reads only the bytes the format needs, so an odd
            # trailing byte is skipped without copying the buffer.
            samples = struct.unpack_from(f'<{num_samples}h', audio_chunk)

            sum_squares = sum(s * s for s in samples)
            return (sum_squares / num_samples) ** 0.5
        except Exception as e:
            log_warning(f"RMS calculation error: {e}")
            return 0.0

    def reset(self):
        """Reset VAD state (speaking flag and timestamps; counters are kept)"""
        self.is_speaking = False
        self.silence_start = None
        self.speech_start = None
        self.last_speech_time = None
        log_debug("🔄 VAD state reset")

    def get_speech_duration(self) -> float:
        """Get current speech duration in seconds.

        Returns 0.0 unless speech is currently in progress.
        """
        if self.speech_start is not None and self.is_speaking:
            return (self._utcnow() - self.speech_start).total_seconds()
        return 0.0

    def get_silence_duration(self) -> float:
        """Get current silence duration in seconds.

        Only reported once speech has been declared ended (``is_speaking`` is
        False) and a silence start time is recorded; while the silence
        threshold is still pending this returns 0.0.
        """
        if self.silence_start is not None and not self.is_speaking:
            return (self._utcnow() - self.silence_start).total_seconds()
        return 0.0

    def get_stats(self) -> dict:
        """Get VAD statistics as a plain dict snapshot"""
        return {
            "is_speaking": self.is_speaking,
            "speech_chunks": self.total_speech_chunks,
            "silence_chunks": self.total_silence_chunks,
            "speech_duration": self.get_speech_duration(),
            "silence_duration": self.get_silence_duration(),
            "energy_threshold": self.energy_threshold,
            "silence_threshold_ms": self.silence_threshold_ms
        }