# Spaces: BricksDisplay / OuteTTS-Speaker-Creator (Running on Zero)
# File size: 4,908 Bytes
# ca494e8

"""
Alias module to redirect whisper imports to whisperx.
This allows OuteTTS to use whisperx instead of the standard whisper package.
"""

import sys
import importlib.util

def setup_whisper_alias():
    """Install a whisperx-backed stand-in under ``sys.modules['whisper']``.

    After a successful call, ``import whisper`` resolves to a small module
    object exposing ``load_model`` (and ``model``), so code written against
    the whisper API transparently runs on whisperx.

    The setup is best-effort: if whisperx is missing or anything goes wrong,
    a warning is printed and ``sys.modules`` is left untouched.

    Returns:
        bool: ``True`` if the alias was installed, ``False`` otherwise.
    """
    # Function-scope imports keep this block self-contained. The ``from``
    # form is used for importlib.machinery so the module-level ``importlib``
    # name is not rebound as a local of this function.
    import os
    import types
    from importlib.machinery import ModuleSpec

    try:
        # Check if whisperx is available
        if importlib.util.find_spec("whisperx") is None:
            print("Warning: whisperx not found, falling back to regular whisper")
            return False

        import whisperx

        def _resolve_device(requested):
            """Translate a whisper-style device argument for whisperx.

            Accepts ``None``/``"auto"`` (pick CUDA when torch reports it),
            plain strings such as ``"cuda"``, ``"cuda:1"``, ``"cpu"``, and
            objects whose ``str()`` has that form (e.g. ``torch.device``).
            Anything that is not CUDA maps to CPU, as before.

            Returns:
                tuple: ``(device, device_index, auto_selected)``.
            """
            spec = "auto" if requested is None else str(requested).strip().lower()
            if spec == "auto":
                try:
                    import torch
                    has_cuda = torch.cuda.is_available()
                except ImportError:
                    has_cuda = False
                return ("cuda" if has_cuda else "cpu"), 0, True

            kind, _, index = spec.partition(":")
            if kind != "cuda":
                return "cpu", 0, False
            return "cuda", (int(index) if index.isdigit() else 0), False

        def _load_backend(name, device, device_index):
            """Load the whisperx model with a compute type suited to *device*."""
            options = {
                "device": device,
                "compute_type": "float16" if device == "cuda" else "int8",
            }
            # Only pass device_index when a non-default GPU was requested so
            # the common call stays identical to the plain two-option form.
            if device_index:
                options["device_index"] = device_index
            return whisperx.load_model(name, **options)

        class WhisperXModelWrapper:
            """Wrapper to make whisperx compatible with whisper interface."""

            def __init__(self, model, device):
                self.model = model
                self.device = device
                # language code -> (align_model, metadata); avoids reloading
                # the alignment model on every transcribe() call.
                self._align_models = {}

            def _get_align_model(self, language_code):
                """Return the cached alignment model for *language_code*."""
                if language_code not in self._align_models:
                    self._align_models[language_code] = whisperx.load_align_model(
                        language_code=language_code,
                        device=self.device
                    )
                return self._align_models[language_code]

            def transcribe(self, audio, **kwargs):
                """Transcribe audio with whisper-compatible interface.

                Args:
                    audio: File path (``str`` or path-like) or already loaded
                        audio data accepted by the whisperx model.
                    **kwargs: ``word_timestamps`` (bool) triggers alignment;
                        ``batch_size`` (int, default 16) is forwarded to
                        whisperx. Other whisper options are ignored.

                Returns:
                    dict: Result with ``"segments"`` and ``"text"`` always
                    present, ``"language"`` when whisperx reported one, and a
                    ``"words"`` list on every segment if word timestamps were
                    requested.
                """
                want_words = kwargs.get('word_timestamps', False)

                # Load audio if it's a file path (str or pathlib.Path alike)
                if isinstance(audio, (str, os.PathLike)):
                    audio_data = whisperx.load_audio(os.fspath(audio))
                else:
                    audio_data = audio

                result = self.model.transcribe(
                    audio_data,
                    batch_size=kwargs.get('batch_size', 16)
                )

                # Remember the detected language: the aligned result that may
                # replace `result` below does not necessarily carry it over.
                language = result.get("language")

                # If word timestamps are requested, perform alignment
                if want_words and result.get("segments"):
                    try:
                        model_a, metadata = self._get_align_model(language or "en")
                        result = whisperx.align(
                            result["segments"],
                            model_a,
                            metadata,
                            audio_data,
                            self.device,
                            return_char_alignments=False
                        )
                    except Exception as e:
                        # Best effort: keep the unaligned transcription.
                        print(f"Warning: Could not perform alignment: {e}")

                # Ensure result format is compatible with whisper format
                if language is not None:
                    result.setdefault("language", language)
                result.setdefault("segments", [])

                # Ensure 'text' field exists - concatenate all segment texts
                if "text" not in result:
                    result["text"] = " ".join(
                        segment.get("text", "") for segment in result["segments"]
                    )

                # Callers asking for word timestamps expect a 'words' list on
                # every segment, even when alignment produced none.
                if want_words:
                    for segment in result["segments"]:
                        segment.setdefault("words", [])

                return result

        def load_model(name, device=None, **kwargs):
            """Load model with whisperx compatible interface.

            Args:
                name: Model name or path understood by ``whisperx.load_model``.
                device: ``None``/``"auto"`` to pick CUDA when available,
                    otherwise a device string or ``torch.device``.
                **kwargs: Further whisper options; accepted and ignored.

            Returns:
                WhisperXModelWrapper: Object exposing ``transcribe``.
            """
            device_name, device_index, auto_selected = _resolve_device(device)
            try:
                model = _load_backend(name, device_name, device_index)
            except Exception as e:
                # Only second-guess a device we picked ourselves; an explicit
                # request that fails should surface to the caller.
                if not (auto_selected and device_name == "cuda"):
                    raise
                print(f"Warning: Could not load whisperx model on CUDA ({e}); retrying on CPU")
                device_name, device_index = "cpu", 0
                model = _load_backend(name, device_name, device_index)

            align_device = f"{device_name}:{device_index}" if device_index else device_name
            return WhisperXModelWrapper(model, align_device)

        # Create the alias as a real module object (with a spec) so import
        # machinery and introspection treat it like any other module.
        whisper_alias = types.ModuleType(
            "whisper",
            "whisperx-backed stand-in for the whisper package."
        )
        whisper_alias.__spec__ = ModuleSpec("whisper", loader=None)
        whisper_alias.load_model = load_model
        whisper_alias.model = getattr(whisperx, 'WhisperModel', None)

        # Add to sys.modules so 'import whisper' uses our alias
        sys.modules['whisper'] = whisper_alias

        print("✅ Successfully aliased whisper to whisperx")
        return True

    except ImportError as e:
        print(f"Warning: Could not setup whisper alias: {e}")
        print("Falling back to regular whisper (if available)")
        return False
    except Exception as e:
        # Top-level boundary: never let alias setup break the importing app.
        print(f"Warning: Error setting up whisper alias: {e}")
        return False

# Auto-setup when module is imported: the alias is installed as an import-time
# side effect, so this module should be imported before any code that runs
# `import whisper` (modules that already imported whisper keep their reference).
setup_whisper_alias()