""" Voice Activity Detector for Flare ================================== Detects speech/silence in audio streams """ import struct from typing import Tuple from datetime import datetime from utils.logger import log_debug, log_warning class VoiceActivityDetector: """Detect speech and silence in audio stream""" def __init__(self, energy_threshold: float = 500, silence_threshold_ms: int = 2000, sample_rate: int = 16000): """ Initialize VAD Args: energy_threshold: RMS energy threshold for speech detection silence_threshold_ms: Milliseconds of silence before considering speech ended sample_rate: Audio sample rate """ self.energy_threshold = energy_threshold self.silence_threshold_ms = silence_threshold_ms self.sample_rate = sample_rate # State tracking self.is_speaking = False self.silence_start: Optional[datetime] = None self.speech_start: Optional[datetime] = None self.last_speech_time: Optional[datetime] = None # Statistics self.total_speech_chunks = 0 self.total_silence_chunks = 0 def process_chunk(self, audio_chunk: bytes) -> Tuple[bool, int]: """ Process audio chunk and detect speech/silence Args: audio_chunk: Raw PCM audio data (LINEAR16) Returns: Tuple of (is_speech, silence_duration_ms) """ try: # Calculate RMS energy rms_energy = self._calculate_rms_energy(audio_chunk) is_speech = rms_energy > self.energy_threshold now = datetime.utcnow() if is_speech: # Speech detected if not self.is_speaking: # Speech just started self.is_speaking = True self.speech_start = now log_debug(f"🎤 Speech started (RMS: {rms_energy})") self.last_speech_time = now self.silence_start = None self.total_speech_chunks += 1 return True, 0 else: # Silence detected self.total_silence_chunks += 1 if self.is_speaking: # Was speaking, now silent if self.silence_start is None: self.silence_start = now log_debug(f"🔇 Silence started (RMS: {rms_energy})") # Calculate silence duration silence_duration = (now - self.silence_start).total_seconds() * 1000 if silence_duration >= self.silence_threshold_ms: # Speech has ended self.is_speaking = False log_debug(f"💬 Speech ended after {silence_duration:.0f}ms of silence") return False, int(silence_duration) else: # Already in silence return False, 0 except Exception as e: log_warning(f"VAD processing error: {e}") # On error, assume speech to avoid cutting off return True, 0 def _calculate_rms_energy(self, audio_chunk: bytes) -> float: """Calculate RMS energy of audio chunk""" try: # Handle empty or invalid chunk if not audio_chunk or len(audio_chunk) < 2: return 0.0 # Ensure even number of bytes for 16-bit audio if len(audio_chunk) % 2 != 0: audio_chunk = audio_chunk[:-1] # Convert bytes to int16 samples num_samples = len(audio_chunk) // 2 samples = struct.unpack(f'{num_samples}h', audio_chunk) if not samples: return 0.0 # Calculate RMS sum_squares = sum(s * s for s in samples) rms = (sum_squares / len(samples)) ** 0.5 return rms except Exception as e: log_warning(f"RMS calculation error: {e}") return 0.0 def reset(self): """Reset VAD state""" self.is_speaking = False self.silence_start = None self.speech_start = None self.last_speech_time = None log_debug("🔄 VAD state reset") def get_speech_duration(self) -> float: """Get current speech duration in seconds""" if self.speech_start and self.is_speaking: return (datetime.utcnow() - self.speech_start).total_seconds() return 0.0 def get_silence_duration(self) -> float: """Get current silence duration in seconds""" if self.silence_start and not self.is_speaking: return (datetime.utcnow() - self.silence_start).total_seconds() return 0.0 
    def get_stats(self) -> dict:
        """Get VAD statistics"""
        return {
            "is_speaking": self.is_speaking,
            "speech_chunks": self.total_speech_chunks,
            "silence_chunks": self.total_silence_chunks,
            "speech_duration": self.get_speech_duration(),
            "silence_duration": self.get_silence_duration(),
            "energy_threshold": self.energy_threshold,
            "silence_threshold_ms": self.silence_threshold_ms
        }
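

# --- Usage sketch (illustrative, not part of the original module) -----------
# A minimal sketch of how a caller might drive the detector with 100 ms
# LINEAR16 chunks. The synthetic tone/silence data, the tone_chunk helper,
# and the parameter values below are assumptions made for demonstration only.
if __name__ == "__main__":
    import math
    import time

    vad = VoiceActivityDetector(energy_threshold=500, silence_threshold_ms=300)
    sample_rate = 16000
    chunk_samples = sample_rate // 10  # 100 ms per chunk

    def tone_chunk(amplitude: int) -> bytes:
        """Build one 100 ms chunk of a 440 Hz sine tone at the given amplitude."""
        samples = [
            int(amplitude * math.sin(2 * math.pi * 440 * i / sample_rate))
            for i in range(chunk_samples)
        ]
        return struct.pack(f'{chunk_samples}h', *samples)

    silence = b'\x00\x00' * chunk_samples

    # Five loud chunks (detected as speech), then silent chunks. Sleeping
    # 100 ms per chunk paces the loop in roughly real time so the wall-clock
    # silence threshold can actually elapse and end-of-speech is reported.
    for chunk in [tone_chunk(3000)] * 5 + [silence] * 5:
        is_speech, silence_ms = vad.process_chunk(chunk)
        print(f"is_speech={is_speech} silence_ms={silence_ms}")
        time.sleep(0.1)

    print(vad.get_stats())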