File size: 12,624 Bytes
b163aa7
 
 
 
 
 
797f6a7
b163aa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""
Audio Post-Processing Module
============================

Handles audio post-processing, optimization, and quality enhancement.
Implements cross-fading, noise reduction, and dynamic range optimization.
Optimized for Hugging Face Spaces deployment.
"""

import logging
import time
from typing import Tuple, List, Optional
import numpy as np
import scipy.signal
from scipy.ndimage import gaussian_filter1d

logger = logging.getLogger(__name__)


class AudioProcessor:
    """Advanced audio post-processor for TTS output optimization."""
    
    def __init__(self, 
                 crossfade_duration: float = 0.1,
                 sample_rate: int = 16000,
                 apply_noise_gate: bool = True,
                 normalize_audio: bool = True):
        """
        Initialize audio processor.
        
        Args:
            crossfade_duration: Duration of crossfade between chunks in seconds
            sample_rate: Audio sample rate
            apply_noise_gate: Whether to apply noise gating
            normalize_audio: Whether to normalize audio levels
        """
        self.crossfade_duration = crossfade_duration
        self.sample_rate = sample_rate
        self.apply_noise_gate = apply_noise_gate
        self.normalize_audio = normalize_audio
        
        # Calculate crossfade samples
        self.crossfade_samples = int(crossfade_duration * sample_rate)
        
        logger.info(f"AudioProcessor initialized with {crossfade_duration}s crossfade")
    
    def _create_crossfade_window(self, length: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create crossfade windows for smooth transitions.
        
        Args:
            length: Length of crossfade in samples
            
        Returns:
            Tuple of (fade_out_window, fade_in_window)
        """
        # Use raised cosine (Hann) window for smooth transitions
        window = np.hanning(2 * length)
        fade_out = window[:length]
        fade_in = window[length:]
        
        return fade_out, fade_in
    
    def crossfade_audio_segments(self, audio_segments: List[np.ndarray]) -> np.ndarray:
        """
        Crossfade multiple audio segments for smooth concatenation.
        
        Args:
            audio_segments: List of audio arrays to concatenate
            
        Returns:
            Smoothly concatenated audio array
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)
        
        if len(audio_segments) == 1:
            return audio_segments[0]
        
        logger.debug(f"Crossfading {len(audio_segments)} audio segments")
        
        # Start with the first segment
        result = audio_segments[0].astype(np.float32)
        
        for i in range(1, len(audio_segments)):
            current_segment = audio_segments[i].astype(np.float32)
            
            # Determine crossfade length (limited by segment lengths)
            fade_length = min(
                self.crossfade_samples,
                len(result) // 2,
                len(current_segment) // 2
            )
            
            if fade_length > 0:
                # Create crossfade windows
                fade_out, fade_in = self._create_crossfade_window(fade_length)
                
                # Apply crossfade
                # Fade out end of result
                result[-fade_length:] *= fade_out
                
                # Fade in beginning of current segment
                current_segment[:fade_length] *= fade_in
                
                # Overlap and add
                overlap = result[-fade_length:] + current_segment[:fade_length]
                
                # Concatenate: result (except overlapped part) + overlap + current (except overlapped part)
                result = np.concatenate([
                    result[:-fade_length],
                    overlap,
                    current_segment[fade_length:]
                ])
            else:
                # No crossfade possible, simple concatenation
                result = np.concatenate([result, current_segment])
        
        return result.astype(np.int16)
    
    def _apply_noise_gate(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
        """
        Apply noise gate to reduce background noise.
        
        Args:
            audio: Input audio array
            threshold_db: Noise gate threshold in dB
            
        Returns:
            Noise-gated audio
        """
        # Convert to float for processing
        audio_float = audio.astype(np.float32)
        
        # Calculate RMS energy in sliding window
        window_size = int(0.01 * self.sample_rate)  # 10ms window
        
        if len(audio_float) < window_size:
            # For very short audio, return as-is
            return audio.astype(np.int16)
        
        # Pad audio for edge cases
        padded_audio = np.pad(audio_float, window_size//2, mode='reflect')
        
        # Calculate RMS energy
        rms = np.sqrt(np.convolve(padded_audio**2, 
                                  np.ones(window_size)/window_size, 
                                  mode='valid'))
        
        # Ensure rms has the same length as original audio
        if len(rms) != len(audio_float):
            # Resize to match original audio length
            from scipy.ndimage import zoom
            zoom_factor = len(audio_float) / len(rms)
            rms = zoom(rms, zoom_factor)
        
        # Convert to dB
        rms_db = 20 * np.log10(np.maximum(rms, 1e-10))
        
        # Create gate mask
        threshold_linear = 10**(threshold_db/20)
        gate_mask = (rms / np.max(rms)) > threshold_linear
        
        # Smooth the gate mask to avoid clicks
        gate_mask = gaussian_filter1d(gate_mask.astype(float), sigma=2)
        
        # Ensure gate_mask has the same length as audio
        if len(gate_mask) != len(audio_float):
            from scipy.ndimage import zoom
            zoom_factor = len(audio_float) / len(gate_mask)
            gate_mask = zoom(gate_mask, zoom_factor)
        
        # Apply gate
        gated_audio = audio_float * gate_mask
        
        return gated_audio.astype(np.int16)
    
    def _normalize_audio(self, audio: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
        """
        Normalize audio to target peak level.
        
        Args:
            audio: Input audio array
            target_peak: Target peak level (0.0 to 1.0)
            
        Returns:
            Normalized audio
        """
        audio_float = audio.astype(np.float32)
        
        # Find current peak
        current_peak = np.max(np.abs(audio_float))
        
        if current_peak > 0:
            # Calculate scaling factor
            scale_factor = (target_peak * 32767) / current_peak
            
            # Apply scaling
            normalized = audio_float * scale_factor
            
            # Clip to prevent overflow
            normalized = np.clip(normalized, -32767, 32767)
            
            return normalized.astype(np.int16)
        
        return audio
    
    def _apply_dynamic_range_compression(self, audio: np.ndarray, 
                                        ratio: float = 4.0, 
                                        threshold_db: float = -12.0) -> np.ndarray:
        """
        Apply dynamic range compression to even out volume levels.
        
        Args:
            audio: Input audio array
            ratio: Compression ratio
            threshold_db: Compression threshold in dB
            
        Returns:
            Compressed audio
        """
        audio_float = audio.astype(np.float32) / 32767.0
        
        # Calculate envelope
        envelope = np.abs(audio_float)
        envelope = gaussian_filter1d(envelope, sigma=int(0.001 * self.sample_rate))
        
        # Convert to dB
        envelope_db = 20 * np.log10(np.maximum(envelope, 1e-10))
        
        # Calculate gain reduction
        gain_reduction = np.zeros_like(envelope_db)
        over_threshold = envelope_db > threshold_db
        gain_reduction[over_threshold] = (envelope_db[over_threshold] - threshold_db) / ratio
        
        # Convert back to linear
        gain_linear = 10**(-gain_reduction / 20)
        
        # Apply compression
        compressed = audio_float * gain_linear
        
        return (compressed * 32767).astype(np.int16)
    
    def process_audio(self, audio: np.ndarray, 
                     apply_compression: bool = False,
                     compression_ratio: float = 3.0) -> np.ndarray:
        """
        Apply full audio processing pipeline.
        
        Args:
            audio: Input audio array
            apply_compression: Whether to apply dynamic range compression
            compression_ratio: Compression ratio if compression is applied
            
        Returns:
            Processed audio
        """
        start_time = time.time()
        
        if len(audio) == 0:
            return audio
        
        processed_audio = audio.copy()
        
        try:
            # Apply noise gate
            if self.apply_noise_gate:
                processed_audio = self._apply_noise_gate(processed_audio)
            
            # Apply compression if requested
            if apply_compression:
                processed_audio = self._apply_dynamic_range_compression(
                    processed_audio, ratio=compression_ratio
                )
            
            # Normalize audio
            if self.normalize_audio:
                processed_audio = self._normalize_audio(processed_audio)
            
            processing_time = time.time() - start_time
            logger.debug(f"Audio processed in {processing_time:.3f}s")
            
            return processed_audio
            
        except Exception as e:
            logger.error(f"Audio processing failed: {e}")
            return audio  # Return original audio on failure
    
    def process_and_concatenate(self, audio_segments: List[np.ndarray],
                               apply_processing: bool = True) -> np.ndarray:
        """
        Process and concatenate multiple audio segments.
        
        Args:
            audio_segments: List of audio arrays
            apply_processing: Whether to apply full processing pipeline
            
        Returns:
            Processed and concatenated audio
        """
        if not audio_segments:
            return np.array([], dtype=np.int16)
        
        # First, crossfade the segments
        concatenated = self.crossfade_audio_segments(audio_segments)
        
        # Then apply processing if requested
        if apply_processing:
            concatenated = self.process_audio(concatenated)
        
        return concatenated
    
    def add_silence(self, audio: np.ndarray, 
                   start_silence: float = 0.1, 
                   end_silence: float = 0.1) -> np.ndarray:
        """
        Add silence padding to audio.
        
        Args:
            audio: Input audio array
            start_silence: Silence duration at start in seconds
            end_silence: Silence duration at end in seconds
            
        Returns:
            Audio with added silence
        """
        start_samples = int(start_silence * self.sample_rate)
        end_samples = int(end_silence * self.sample_rate)
        
        start_pad = np.zeros(start_samples, dtype=audio.dtype)
        end_pad = np.zeros(end_samples, dtype=audio.dtype)
        
        return np.concatenate([start_pad, audio, end_pad])
    
    def get_audio_stats(self, audio: np.ndarray) -> dict:
        """
        Get audio statistics for quality analysis.
        
        Args:
            audio: Audio array to analyze
            
        Returns:
            Dictionary of audio statistics
        """
        if len(audio) == 0:
            return {"error": "Empty audio"}
        
        audio_float = audio.astype(np.float32)
        
        return {
            "duration_seconds": len(audio) / self.sample_rate,
            "sample_count": len(audio),
            "peak_amplitude": np.max(np.abs(audio_float)),
            "rms_level": np.sqrt(np.mean(audio_float**2)),
            "dynamic_range_db": 20 * np.log10(np.max(np.abs(audio_float)) / 
                                             (np.sqrt(np.mean(audio_float**2)) + 1e-10)),
            "zero_crossings": np.sum(np.diff(np.signbit(audio_float))),
            "dc_offset": np.mean(audio_float)
        }