"""
Modal transcription adapter for remote processing
"""

import requests
import base64
import pathlib
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class ModalTranscriptionAdapter(ITranscriber):
    """Adapter for Modal remote transcription processing"""
    
    def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
        self.config = config or AudioProcessingConfig()
        self.endpoint_url = endpoint_url
    
    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using Modal endpoint"""
        
        if not self.endpoint_url:
            raise TranscriptionError(
                "Modal endpoint URL not configured",
                model=model_size,
                audio_file=audio_file_path
            )
        
        try:
            # Read and encode audio file
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path
                )
            
            with open(audio_path, 'rb') as f:
                audio_data = f.read()
            
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
            
            # Prepare request data
            request_data = {
                "audio_file_data": audio_base64,
                "audio_file_name": audio_path.name,
                "model_size": model_size,
                "language": language,
                "output_format": "json",
                "enable_speaker_diarization": enable_speaker_diarization
            }
            
            print(f"πŸ”„ Sending transcription request to Modal endpoint")
            print(f"πŸ“ File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
            print(f"πŸ”§ Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")
            
            # Make request to Modal endpoint
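            # NOTE: requests.post is synchronous, so this call blocks the event loop
            # for the duration of the upload and remote transcription.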
            response = requests.post(
                self.endpoint_url,
                json=request_data,
                timeout=1800  # 30 minutes timeout
            )
            
            response.raise_for_status()
            result = response.json()
            
            print(f"βœ… Modal transcription completed")
            
            # Convert result to TranscriptionResult format
            return self._convert_modal_result(result)
            
        except TranscriptionError:
            # Re-raise errors raised above (e.g. missing file or endpoint) without re-wrapping
            raise
        except requests.exceptions.RequestException as e:
            raise TranscriptionError(
                f"Failed to call Modal endpoint: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
        except Exception as e:
            raise TranscriptionError(
                f"Modal transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
    
    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())
    
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
    
    def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
        """Convert Modal result format to TranscriptionResult"""
        
        # Extract segments if available
        segments = []
        if "segments" in modal_result:
            for seg in modal_result["segments"]:
                segments.append(TranscriptionSegment(
                    start=seg.get("start", 0),
                    end=seg.get("end", 0),
                    text=seg.get("text", ""),
                    speaker=seg.get("speaker")
                ))
        
        return TranscriptionResult(
            text=modal_result.get("text", ""),
            segments=segments,
            language=modal_result.get("language_detected", "unknown"),
            model_used=modal_result.get("model_used", "unknown"),
            audio_duration=modal_result.get("audio_duration", 0),
            processing_time=modal_result.get("processing_time", 0),
            speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
            global_speaker_count=modal_result.get("global_speaker_count", 0),
            error_message=modal_result.get("error_message")
        )
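

# Minimal usage sketch. Everything below is illustrative: the environment
# variable name, the audio file path, and the assumption that this module is
# run as part of its package (its relative imports prevent running the file
# directly).
if __name__ == "__main__":
    import asyncio
    import os

    adapter = ModalTranscriptionAdapter(
        endpoint_url=os.environ.get("MODAL_TRANSCRIBE_URL")  # hypothetical env var
    )
    result = asyncio.run(
        adapter.transcribe(
            "sample.mp3",
            model_size="turbo",
            enable_speaker_diarization=False,
        )
    )
    print(result.text)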