"""
Audio Post-Processing Module
============================

Handles audio post-processing, optimization, and quality enhancement.
Implements cross-fading, noise reduction, and dynamic range optimization.
Optimized for Hugging Face Spaces deployment.
"""

import logging
import time
from typing import Tuple, List, Optional

import numpy as np
import scipy.signal
from scipy.ndimage import gaussian_filter1d

logger = logging.getLogger(__name__)


class AudioProcessor:
    """Advanced audio post-processor for TTS output optimization."""

    def __init__(self,
                 crossfade_duration: float = 0.1,
                 sample_rate: int = 16000,
                 apply_noise_gate: bool = True,
                 normalize_audio: bool = True):
        """
        Initialize audio processor.

        Args:
            crossfade_duration: Duration of crossfade between chunks in seconds
            sample_rate: Audio sample rate
            apply_noise_gate: Whether to apply noise gating
            normalize_audio: Whether to normalize audio levels
        """
        self.crossfade_duration = crossfade_duration
        self.sample_rate = sample_rate
        self.apply_noise_gate = apply_noise_gate
        self.normalize_audio = normalize_audio

        # Crossfade length expressed in samples (truncated towards zero).
        self.crossfade_samples = int(crossfade_duration * sample_rate)

        logger.info(f"AudioProcessor initialized with {crossfade_duration}s crossfade")

    @staticmethod
    def _to_int16(samples: np.ndarray) -> np.ndarray:
        """
        Convert floating-point samples to int16.

        Values are rounded to the nearest integer and clipped to the int16
        range first, so out-of-range values saturate instead of wrapping.

        Args:
            samples: Sample values on the int16 scale, any numeric dtype

        Returns:
            int16 array of the same shape
        """
        return np.clip(np.rint(samples), -32768, 32767).astype(np.int16)

    def _create_crossfade_window(self, length: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create crossfade windows for smooth transitions.

        The two windows are complementary raised-cosine ramps: fade_out falls
        from ~1 to ~0, fade_in rises from ~0 to ~1, and they sum to 1 at
        every sample so a constant signal keeps its level through the fade.

        Args:
            length: Length of crossfade in samples

        Returns:
            Tuple of (fade_out_window, fade_in_window), both float32
        """
        if length <= 0:
            return np.zeros(0, dtype=np.float32), np.zeros(0, dtype=np.float32)

        # Sample the ramp at bin centres so neither window hits exactly 0 or 1;
        # for length == 1 this yields an even 50/50 mix.
        phase = (np.arange(length, dtype=np.float64) + 0.5) / length
        fade_in = (0.5 - 0.5 * np.cos(np.pi * phase)).astype(np.float32)
        fade_out = (1.0 - fade_in).astype(np.float32)
        return fade_out, fade_in

    def crossfade_audio_segments(self, audio_segments: List[np.ndarray]) -> np.ndarray:
        """
        Crossfade multiple audio segments for smooth concatenation.

        Each junction overlaps the end of the audio accumulated so far with
        the start of the next segment, so the output is shorter than the sum
        of the segment lengths by the total crossfade length.

        Args:
            audio_segments: List of audio arrays to concatenate

        Returns:
            Smoothly concatenated audio array. An empty list gives an empty
            int16 array; a single segment is returned unchanged; otherwise
            the result is int16.
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)

        if len(audio_segments) == 1:
            return audio_segments[0]

        logger.debug(f"Crossfading {len(audio_segments)} audio segments")

        # Accumulate in float32 so the fades do not suffer integer truncation.
        result = audio_segments[0].astype(np.float32)

        for segment in audio_segments[1:]:
            current_segment = segment.astype(np.float32)

            # Crossfade length is limited by the audio available on both sides.
            fade_length = min(
                self.crossfade_samples,
                len(result) // 2,
                len(current_segment) // 2,
            )

            if fade_length > 0:
                fade_out, fade_in = self._create_crossfade_window(fade_length)

                # Outgoing audio fades out while incoming audio fades in.
                overlap = (
                    result[-fade_length:] * fade_out
                    + current_segment[:fade_length] * fade_in
                )

                result = np.concatenate([
                    result[:-fade_length],
                    overlap,
                    current_segment[fade_length:],
                ])
            else:
                # No crossfade possible, simple concatenation.
                result = np.concatenate([result, current_segment])

        return self._to_int16(result)

    def _apply_noise_gate(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
        """
        Apply noise gate to reduce background noise.

        Short-time RMS is measured over a 10 ms window; samples whose RMS,
        relative to the loudest window, is at or below the threshold are
        attenuated. The gate mask is smoothed to avoid clicks.

        Args:
            audio: Input audio array
            threshold_db: Noise gate threshold in dB relative to the peak RMS

        Returns:
            Noise-gated audio as int16
        """
        audio_float = audio.astype(np.float32)
        sample_count = len(audio_float)

        # 10 ms analysis window; at least one sample so the kernel is never empty.
        window_size = max(1, int(0.01 * self.sample_rate))

        if sample_count < window_size:
            # For very short audio, return as-is.
            return audio.astype(np.int16)

        # Reflect-pad so the moving average is defined at the edges.
        padded_audio = np.pad(audio_float, window_size // 2, mode='reflect')

        kernel = np.ones(window_size) / window_size
        mean_square = np.convolve(padded_audio ** 2, kernel, mode='valid')

        # An even window yields one extra value; trim to the input length.
        rms = np.sqrt(mean_square)[:sample_count]

        peak_rms = np.max(rms)
        if peak_rms <= 0:
            # Entirely silent input: nothing to gate, and avoids 0/0 below.
            return audio.astype(np.int16)

        threshold_linear = 10 ** (threshold_db / 20)
        gate_mask = (rms / peak_rms) > threshold_linear

        # Smooth the gate mask to avoid clicks.
        gate_mask = gaussian_filter1d(gate_mask.astype(float), sigma=2)

        return self._to_int16(audio_float * gate_mask)

    def _normalize_audio(self, audio: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
        """
        Normalize audio to target peak level.

        Args:
            audio: Input audio array
            target_peak: Target peak level (0.0 to 1.0) as a fraction of
                int16 full scale (32767)

        Returns:
            Normalized int16 audio, or the input unchanged when it is
            entirely silent
        """
        audio_float = audio.astype(np.float32)

        current_peak = np.max(np.abs(audio_float))
        if current_peak <= 0:
            # Silent audio cannot be scaled to a peak.
            return audio

        scale_factor = (target_peak * 32767) / current_peak

        # Clip to prevent overflow when target_peak exceeds 1.0.
        normalized = np.clip(audio_float * scale_factor, -32767, 32767)
        return np.rint(normalized).astype(np.int16)

    def _apply_dynamic_range_compression(self,
                                         audio: np.ndarray,
                                         ratio: float = 4.0,
                                         threshold_db: float = -12.0) -> np.ndarray:
        """
        Apply dynamic range compression to even out volume levels.

        Above the threshold, an input level increase of `ratio` dB produces
        an output increase of 1 dB. A ratio of 1.0 leaves the audio
        unchanged; larger ratios compress harder.

        Args:
            audio: Input audio array
            ratio: Compression ratio (must be positive)
            threshold_db: Compression threshold in dB relative to full scale

        Returns:
            Compressed audio as int16

        Raises:
            ValueError: If ratio is not positive
        """
        if ratio <= 0:
            raise ValueError(f"Compression ratio must be positive, got {ratio}")

        audio_float = audio.astype(np.float32) / 32767.0

        # Smoothed amplitude envelope (~1 ms sigma); sigma must be non-zero.
        sigma = max(1, int(0.001 * self.sample_rate))
        envelope = gaussian_filter1d(np.abs(audio_float), sigma=sigma)

        envelope_db = 20 * np.log10(np.maximum(envelope, 1e-10))

        # Output level is threshold + excess / ratio, so the gain reduction
        # is excess * (1 - 1 / ratio).
        excess_db = np.maximum(envelope_db - threshold_db, 0.0)
        gain_reduction = excess_db * (1.0 - 1.0 / ratio)

        gain_linear = 10 ** (-gain_reduction / 20)

        compressed = audio_float * gain_linear
        return self._to_int16(compressed * 32767)

    def process_audio(self,
                      audio: np.ndarray,
                      apply_compression: bool = False,
                      compression_ratio: float = 3.0) -> np.ndarray:
        """
        Apply full audio processing pipeline.

        Stages run in order: noise gate (if enabled on the instance),
        compression (if requested), normalization (if enabled on the
        instance). Processing is best-effort: if any stage raises, the error
        is logged and the original audio is returned.

        Args:
            audio: Input audio array
            apply_compression: Whether to apply dynamic range compression
            compression_ratio: Compression ratio if compression is applied

        Returns:
            Processed audio
        """
        start_time = time.perf_counter()

        if len(audio) == 0:
            return audio

        processed_audio = audio.copy()

        try:
            if self.apply_noise_gate:
                processed_audio = self._apply_noise_gate(processed_audio)

            if apply_compression:
                processed_audio = self._apply_dynamic_range_compression(
                    processed_audio,
                    ratio=compression_ratio,
                )

            if self.normalize_audio:
                processed_audio = self._normalize_audio(processed_audio)

            processing_time = time.perf_counter() - start_time
            logger.debug(f"Audio processed in {processing_time:.3f}s")

            return processed_audio

        except Exception as e:
            logger.error(f"Audio processing failed: {e}")
            return audio  # Return original audio on failure

    def process_and_concatenate(self,
                                audio_segments: List[np.ndarray],
                                apply_processing: bool = True) -> np.ndarray:
        """
        Process and concatenate multiple audio segments.

        Args:
            audio_segments: List of audio arrays
            apply_processing: Whether to apply full processing pipeline

        Returns:
            Processed and concatenated audio
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)

        # Crossfade first so processing sees the final joined signal.
        concatenated = self.crossfade_audio_segments(audio_segments)

        if apply_processing:
            concatenated = self.process_audio(concatenated)

        return concatenated

    def add_silence(self,
                    audio: np.ndarray,
                    start_silence: float = 0.1,
                    end_silence: float = 0.1) -> np.ndarray:
        """
        Add silence padding to audio.

        Args:
            audio: Input audio array
            start_silence: Silence duration at start in seconds
            end_silence: Silence duration at end in seconds

        Returns:
            Audio with added silence, in the same dtype as the input
        """
        start_samples = int(start_silence * self.sample_rate)
        end_samples = int(end_silence * self.sample_rate)

        start_pad = np.zeros(start_samples, dtype=audio.dtype)
        end_pad = np.zeros(end_samples, dtype=audio.dtype)

        return np.concatenate([start_pad, audio, end_pad])

    def get_audio_stats(self, audio: np.ndarray) -> dict:
        """
        Get audio statistics for quality analysis.

        Args:
            audio: Audio array to analyze

        Returns:
            Dictionary of audio statistics as plain Python numbers, or
            {"error": "Empty audio"} for empty input. "dynamic_range_db" is
            the peak-to-RMS ratio in dB and is 0.0 for silent audio.
        """
        if len(audio) == 0:
            return {"error": "Empty audio"}

        # float64 keeps squared int16-scale samples exact.
        audio_float = audio.astype(np.float64)

        peak = float(np.max(np.abs(audio_float)))
        rms = float(np.sqrt(np.mean(audio_float ** 2)))

        if peak > 0:
            dynamic_range_db = float(20 * np.log10(peak / (rms + 1e-10)))
        else:
            # Silent audio: avoid log10(0) and report no dynamic range.
            dynamic_range_db = 0.0

        return {
            "duration_seconds": len(audio) / self.sample_rate,
            "sample_count": len(audio),
            "peak_amplitude": peak,
            "rms_level": rms,
            "dynamic_range_db": dynamic_range_db,
            "zero_crossings": int(np.sum(np.diff(np.signbit(audio_float)))),
            "dc_offset": float(np.mean(audio_float)),
        }