# Spaces: Agents-MCP-Hackathon / ModalTranscriberMCP (Running)
# File size: 3,908 Bytes
# Revision: b5df735

"""
Local Whisper transcriber implementation
"""

import whisper
import torch
import pathlib
import time
from typing import Optional, List

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError, ModelLoadError


class WhisperTranscriber(ITranscriber):
    """Transcriber that runs Whisper models in-process.

    Loaded models are cached on the instance, keyed by model size, so
    repeated calls with the same size reuse the already-loaded model.
    """

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        """Create a transcriber.

        Args:
            config: Audio processing configuration. A default
                ``AudioProcessingConfig`` is created when omitted.
        """
        self.config = config or AudioProcessingConfig()
        # Maps model size -> loaded Whisper model.
        self.model_cache: dict = {}
        self.device = self._setup_device()

    def _setup_device(self) -> str:
        """Return ``"cuda"`` when CUDA is available, otherwise ``"cpu"``."""
        return "cuda" if torch.cuda.is_available() else "cpu"

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe an audio file with a local Whisper model.

        NOTE: although this is a coroutine, it contains no ``await``; model
        loading and inference run synchronously and block the event loop
        for the duration of the call.

        Args:
            audio_file_path: Path to the audio file to transcribe.
            model_size: Whisper model name passed to ``whisper.load_model``.
            language: Language code, or ``None`` / ``""`` / ``"auto"`` to
                let Whisper detect the language.
            enable_speaker_diarization: Only echoed back in the result; no
                diarization is performed here and ``global_speaker_count``
                is always 0.

        Returns:
            A ``TranscriptionResult`` holding the full text, the per-segment
            breakdown, the language reported by Whisper and timing data.

        Raises:
            TranscriptionError: If the audio file does not exist or any step
                of loading/transcribing fails. Errors that already are a
                ``TranscriptionError`` propagate unchanged; every other
                exception is wrapped, with the original kept as
                ``__cause__``.
        """
        try:
            # Validate audio file
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path
                )

            # Load model (cached after the first use of each size)
            model = self._load_model(model_size)

            # "auto" is advertised by get_supported_languages() but is not a
            # Whisper language code; Whisper auto-detects when given None.
            if not language or language.lower() == "auto":
                whisper_language = None
            else:
                whisper_language = language

            # Transcribe; perf_counter is monotonic, so the elapsed time is
            # not affected by system clock adjustments.
            start_time = time.perf_counter()
            result = model.transcribe(
                str(audio_path),
                language=whisper_language,
                verbose=False
            )
            processing_time = time.perf_counter() - start_time

            # Convert to our format
            raw_segments = result.get("segments", [])
            segments = [
                TranscriptionSegment(
                    start=seg["start"],
                    end=seg["end"],
                    text=seg["text"].strip(),
                    # Passed through unchanged; presumably a log-probability
                    # rather than a 0-1 score -- confirm with consumers.
                    confidence=seg.get("avg_logprob")
                )
                for seg in raw_segments
            ]

            # The result dict may not carry a "duration" entry; fall back to
            # the end time of the last segment instead of reporting 0.
            audio_duration = result.get("duration")
            if audio_duration is None:
                audio_duration = raw_segments[-1]["end"] if raw_segments else 0

            return TranscriptionResult(
                text=result.get("text", "").strip(),
                segments=segments,
                language=result.get("language", "unknown"),
                model_used=model_size,
                audio_duration=audio_duration,
                processing_time=processing_time,
                speaker_diarization_enabled=enable_speaker_diarization,
                global_speaker_count=0,
                error_message=None
            )

        except TranscriptionError:
            # Already in our error format (e.g. missing file): do not wrap it
            # a second time under a misleading "transcription failed" prefix.
            raise
        except Exception as e:
            raise TranscriptionError(
                f"Whisper transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e

    def _load_model(self, model_size: str):
        """Return the Whisper model for ``model_size``, loading it on first use.

        Args:
            model_size: Whisper model name passed to ``whisper.load_model``.

        Returns:
            The loaded model, taken from ``self.model_cache`` when present.

        Raises:
            ModelLoadError: If ``whisper.load_model`` fails; the original
                exception is kept as ``__cause__``.
        """
        if model_size not in self.model_cache:
            try:
                print(f"📥 Loading Whisper model: {model_size}")
                self.model_cache[model_size] = whisper.load_model(
                    model_size,
                    device=self.device
                )
            except Exception as e:
                raise ModelLoadError(
                    f"Failed to load model {model_size}: {str(e)}",
                    model_name=model_size
                ) from e

        return self.model_cache[model_size]

    def get_supported_models(self) -> List[str]:
        """Return the model names listed in ``self.config.whisper_models``."""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Return the supported language codes.

        ``"auto"`` is not a language code; ``transcribe`` treats it as a
        request for automatic language detection.
        """
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]