Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

File size: 12,624 Bytes

b163aa7
 
 
 
 
 
797f6a7
b163aa7

"""
Audio Post-Processing Module
============================

Handles audio post-processing, optimization, and quality enhancement.
Implements cross-fading, noise reduction, and dynamic range optimization.
Optimized for Hugging Face Spaces deployment.
"""

import logging
import time
from typing import Tuple, List, Optional
import numpy as np
import scipy.signal
from scipy.ndimage import gaussian_filter1d

logger = logging.getLogger(__name__)


class AudioProcessor:
    """Advanced audio post-processor for TTS output optimization."""
    
    def __init__(self, 
                 crossfade_duration: float = 0.1,
                 sample_rate: int = 16000,
                 apply_noise_gate: bool = True,
                 normalize_audio: bool = True):
        """
        Initialize audio processor.
        
        Args:
            crossfade_duration: Duration of crossfade between chunks in seconds
            sample_rate: Audio sample rate
            apply_noise_gate: Whether to apply noise gating
            normalize_audio: Whether to normalize audio levels
        """
        self.crossfade_duration = crossfade_duration
        self.sample_rate = sample_rate
        self.apply_noise_gate = apply_noise_gate
        self.normalize_audio = normalize_audio
        
        # Calculate crossfade samples
        self.crossfade_samples = int(crossfade_duration * sample_rate)
        
        logger.info(f"AudioProcessor initialized with {crossfade_duration}s crossfade")
    
    def _create_crossfade_window(self, length: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create crossfade windows for smooth transitions.
        
        Args:
            length: Length of crossfade in samples
            
        Returns:
            Tuple of (fade_out_window, fade_in_window)
        """
        # Use raised cosine (Hann) window for smooth transitions
        window = np.hanning(2 * length)
        fade_out = window[:length]
        fade_in = window[length:]
        
        return fade_out, fade_in
    
    def crossfade_audio_segments(self, audio_segments: List[np.ndarray]) -> np.ndarray:
        """
        Crossfade multiple audio segments for smooth concatenation.
        
        Args:
            audio_segments: List of audio arrays to concatenate
            
        Returns:
            Smoothly concatenated audio array
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)
        
        if len(audio_segments) == 1:
            return audio_segments[0]
        
        logger.debug(f"Crossfading {len(audio_segments)} audio segments")
        
        # Start with the first segment
        result = audio_segments[0].astype(np.float32)
        
        for i in range(1, len(audio_segments)):
            current_segment = audio_segments[i].astype(np.float32)
            
            # Determine crossfade length (limited by segment lengths)
            fade_length = min(
                self.crossfade_samples,
                len(result) // 2,
                len(current_segment) // 2
            )
            
            if fade_length > 0:
                # Create crossfade windows
                fade_out, fade_in = self._create_crossfade_window(fade_length)
                
                # Apply crossfade
                # Fade out end of result
                result[-fade_length:] *= fade_out
                
                # Fade in beginning of current segment
                current_segment[:fade_length] *= fade_in
                
                # Overlap and add
                overlap = result[-fade_length:] + current_segment[:fade_length]
                
                # Concatenate: result (except overlapped part) + overlap + current (except overlapped part)
                result = np.concatenate([
                    result[:-fade_length],
                    overlap,
                    current_segment[fade_length:]
                ])
            else:
                # No crossfade possible, simple concatenation
                result = np.concatenate([result, current_segment])
        
        return result.astype(np.int16)
    
    def _apply_noise_gate(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
        """
        Apply noise gate to reduce background noise.
        
        Args:
            audio: Input audio array
            threshold_db: Noise gate threshold in dB
            
        Returns:
            Noise-gated audio
        """
        # Convert to float for processing
        audio_float = audio.astype(np.float32)
        
        # Calculate RMS energy in sliding window
        window_size = int(0.01 * self.sample_rate)  # 10ms window
        
        if len(audio_float) < window_size:
            # For very short audio, return as-is
            return audio.astype(np.int16)
        
        # Pad audio for edge cases
        padded_audio = np.pad(audio_float, window_size//2, mode='reflect')
        
        # Calculate RMS energy
        rms = np.sqrt(np.convolve(padded_audio**2, 
                                  np.ones(window_size)/window_size, 
                                  mode='valid'))
        
        # Ensure rms has the same length as original audio
        if len(rms) != len(audio_float):
            # Resize to match original audio length
            from scipy.ndimage import zoom
            zoom_factor = len(audio_float) / len(rms)
            rms = zoom(rms, zoom_factor)
        
        # Convert to dB
        rms_db = 20 * np.log10(np.maximum(rms, 1e-10))
        
        # Create gate mask
        threshold_linear = 10**(threshold_db/20)
        gate_mask = (rms / np.max(rms)) > threshold_linear
        
        # Smooth the gate mask to avoid clicks
        gate_mask = gaussian_filter1d(gate_mask.astype(float), sigma=2)
        
        # Ensure gate_mask has the same length as audio
        if len(gate_mask) != len(audio_float):
            from scipy.ndimage import zoom
            zoom_factor = len(audio_float) / len(gate_mask)
            gate_mask = zoom(gate_mask, zoom_factor)
        
        # Apply gate
        gated_audio = audio_float * gate_mask
        
        return gated_audio.astype(np.int16)
    
    def _normalize_audio(self, audio: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
        """
        Normalize audio to target peak level.
        
        Args:
            audio: Input audio array
            target_peak: Target peak level (0.0 to 1.0)
            
        Returns:
            Normalized audio
        """
        audio_float = audio.astype(np.float32)
        
        # Find current peak
        current_peak = np.max(np.abs(audio_float))
        
        if current_peak > 0:
            # Calculate scaling factor
            scale_factor = (target_peak * 32767) / current_peak
            
            # Apply scaling
            normalized = audio_float * scale_factor
            
            # Clip to prevent overflow
            normalized = np.clip(normalized, -32767, 32767)
            
            return normalized.astype(np.int16)
        
        return audio
    
    def _apply_dynamic_range_compression(self, audio: np.ndarray, 
                                        ratio: float = 4.0, 
                                        threshold_db: float = -12.0) -> np.ndarray:
        """
        Apply dynamic range compression to even out volume levels.
        
        Args:
            audio: Input audio array
            ratio: Compression ratio
            threshold_db: Compression threshold in dB
            
        Returns:
            Compressed audio
        """
        audio_float = audio.astype(np.float32) / 32767.0
        
        # Calculate envelope
        envelope = np.abs(audio_float)
        envelope = gaussian_filter1d(envelope, sigma=int(0.001 * self.sample_rate))
        
        # Convert to dB
        envelope_db = 20 * np.log10(np.maximum(envelope, 1e-10))
        
        # Calculate gain reduction
        gain_reduction = np.zeros_like(envelope_db)
        over_threshold = envelope_db > threshold_db
        gain_reduction[over_threshold] = (envelope_db[over_threshold] - threshold_db) / ratio
        
        # Convert back to linear
        gain_linear = 10**(-gain_reduction / 20)
        
        # Apply compression
        compressed = audio_float * gain_linear
        
        return (compressed * 32767).astype(np.int16)
    
    def process_audio(self, audio: np.ndarray, 
                     apply_compression: bool = False,
                     compression_ratio: float = 3.0) -> np.ndarray:
        """
        Apply full audio processing pipeline.
        
        Args:
            audio: Input audio array
            apply_compression: Whether to apply dynamic range compression
            compression_ratio: Compression ratio if compression is applied
            
        Returns:
            Processed audio
        """
        start_time = time.time()
        
        if len(audio) == 0:
            return audio
        
        processed_audio = audio.copy()
        
        try:
            # Apply noise gate
            if self.apply_noise_gate:
                processed_audio = self._apply_noise_gate(processed_audio)
            
            # Apply compression if requested
            if apply_compression:
                processed_audio = self._apply_dynamic_range_compression(
                    processed_audio, ratio=compression_ratio
                )
            
            # Normalize audio
            if self.normalize_audio:
                processed_audio = self._normalize_audio(processed_audio)
            
            processing_time = time.time() - start_time
            logger.debug(f"Audio processed in {processing_time:.3f}s")
            
            return processed_audio
            
        except Exception as e:
            logger.error(f"Audio processing failed: {e}")
            return audio  # Return original audio on failure
    
    def process_and_concatenate(self, audio_segments: List[np.ndarray],
                               apply_processing: bool = True) -> np.ndarray:
        """
        Process and concatenate multiple audio segments.
        
        Args:
            audio_segments: List of audio arrays
            apply_processing: Whether to apply full processing pipeline
            
        Returns:
            Processed and concatenated audio
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)
        
        # First, crossfade the segments
        concatenated = self.crossfade_audio_segments(audio_segments)
        
        # Then apply processing if requested
        if apply_processing:
            concatenated = self.process_audio(concatenated)
        
        return concatenated
    
    def add_silence(self, audio: np.ndarray, 
                   start_silence: float = 0.1, 
                   end_silence: float = 0.1) -> np.ndarray:
        """
        Add silence padding to audio.
        
        Args:
            audio: Input audio array
            start_silence: Silence duration at start in seconds
            end_silence: Silence duration at end in seconds
            
        Returns:
            Audio with added silence
        """
        start_samples = int(start_silence * self.sample_rate)
        end_samples = int(end_silence * self.sample_rate)
        
        start_pad = np.zeros(start_samples, dtype=audio.dtype)
        end_pad = np.zeros(end_samples, dtype=audio.dtype)
        
        return np.concatenate([start_pad, audio, end_pad])
    
    def get_audio_stats(self, audio: np.ndarray) -> dict:
        """
        Get audio statistics for quality analysis.
        
        Args:
            audio: Audio array to analyze
            
        Returns:
            Dictionary of audio statistics
        """
        if len(audio) == 0:
            return {"error": "Empty audio"}
        
        audio_float = audio.astype(np.float32)
        
        return {
            "duration_seconds": len(audio) / self.sample_rate,
            "sample_count": len(audio),
            "peak_amplitude": np.max(np.abs(audio_float)),
            "rms_level": np.sqrt(np.mean(audio_float**2)),
            "dynamic_range_db": 20 * np.log10(np.max(np.abs(audio_float)) / 
                                             (np.sqrt(np.mean(audio_float**2)) + 1e-10)),
            "zero_crossings": np.sum(np.diff(np.signbit(audio_float))),
            "dc_offset": np.mean(audio_float)
        }