"""
Modal transcription adapter for remote processing
"""

import requests
import base64
import pathlib
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class ModalTranscriptionAdapter(ITranscriber):
    """Adapter for Modal remote transcription processing"""
    
    def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
        self.config = config or AudioProcessingConfig()
        self.endpoint_url = endpoint_url
    
    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using Modal endpoint"""
        
        if not self.endpoint_url:
            raise TranscriptionError(
                "Modal endpoint URL not configured",
                model=model_size,
                audio_file=audio_file_path
            )
        
        try:
            # Read and encode audio file
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path
                )
            
            with open(audio_path, 'rb') as f:
                audio_data = f.read()
            
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
            
            # Prepare request data
            request_data = {
                "audio_file_data": audio_base64,
                "audio_file_name": audio_path.name,
                "model_size": model_size,
                "language": language,
                "output_format": "json",
                "enable_speaker_diarization": enable_speaker_diarization
            }
            
            print(f"πŸ”„ Sending transcription request to Modal endpoint")
            print(f"πŸ“ File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
            print(f"πŸ”§ Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")
            
            # Make request to Modal endpoint
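            # NOTE: requests.post is synchronous, so this call blocks the event loop
            # for the duration of the upload and remote transcription.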
            response = requests.post(
                self.endpoint_url,
                json=request_data,
                timeout=1800  # 30 minutes timeout
            )
            
            response.raise_for_status()
            result = response.json()
            
            print(f"βœ… Modal transcription completed")
            
            # Convert result to TranscriptionResult format
            return self._convert_modal_result(result)
            
        except TranscriptionError:
            # Re-raise errors raised above (e.g. missing file or endpoint) without re-wrapping
            raise
        except requests.exceptions.RequestException as e:
            raise TranscriptionError(
                f"Failed to call Modal endpoint: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
        except Exception as e:
            raise TranscriptionError(
                f"Modal transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
    
    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())
    
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
    
    def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
        """Convert Modal result format to TranscriptionResult"""
        
        # Extract segments if available
        segments = []
        if "segments" in modal_result:
            for seg in modal_result["segments"]:
                segments.append(TranscriptionSegment(
                    start=seg.get("start", 0),
                    end=seg.get("end", 0),
                    text=seg.get("text", ""),
                    speaker=seg.get("speaker")
                ))
        
        return TranscriptionResult(
            text=modal_result.get("text", ""),
            segments=segments,
            language=modal_result.get("language_detected", "unknown"),
            model_used=modal_result.get("model_used", "unknown"),
            audio_duration=modal_result.get("audio_duration", 0),
            processing_time=modal_result.get("processing_time", 0),
            speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
            global_speaker_count=modal_result.get("global_speaker_count", 0),
            error_message=modal_result.get("error_message")
        )
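

# Minimal usage sketch. Everything below is illustrative: the environment
# variable name, the audio file path, and the assumption that this module is
# run as part of its package (its relative imports prevent running the file
# directly).
if __name__ == "__main__":
    import asyncio
    import os

    adapter = ModalTranscriptionAdapter(
        endpoint_url=os.environ.get("MODAL_TRANSCRIBE_URL")  # hypothetical env var
    )
    result = asyncio.run(
        adapter.transcribe(
            "sample.mp3",
            model_size="turbo",
            enable_speaker_diarization=False,
        )
    )
    print(result.text)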