""" Modal Transcription Service - handles transcription via Modal endpoints Enhanced to replace transcription_tools.py functions with proper service architecture """ import asyncio import aiohttp import base64 import os from typing import Dict, Any from pathlib import Path class ModalTranscriptionService: """Service for audio transcription via Modal endpoints""" def __init__(self, endpoint_urls: Dict[str, str] = None, cache_dir: str = None, use_direct_modal_calls: bool = True): """ Initialize Modal transcription service Args: endpoint_urls: Dictionary of endpoint URLs (used when use_direct_modal_calls=False) cache_dir: Cache directory path use_direct_modal_calls: Whether to use direct Modal function calls or HTTP endpoints """ self.use_direct_modal_calls = use_direct_modal_calls self.endpoint_urls = endpoint_urls or { "transcribe_audio": "https://richardsucran--transcribe-audio-endpoint.modal.run", "transcribe_chunk": "https://richardsucran--transcribe-audio-chunk-endpoint.modal.run", "health_check": "https://richardsucran--health-check-endpoint.modal.run" } self.cache_dir = cache_dir or "/tmp" # Determine if we're running in Modal environment if self.use_direct_modal_calls: print("✅ Using direct function calls (no HTTP endpoints)") async def transcribe_audio_file( self, audio_file_path: str, model_size: str = "turbo", language: str = None, output_format: str = "srt", enable_speaker_diarization: bool = False, use_parallel_processing: bool = True, chunk_duration: int = 60, use_intelligent_segmentation: bool = True ) -> Dict[str, Any]: """ Transcribe audio file using Modal endpoints with intelligent processing Args: audio_file_path: Path to audio file model_size: Whisper model size language: Language code (None for auto-detect) output_format: Output format (srt, txt, json) enable_speaker_diarization: Whether to enable speaker diarization use_parallel_processing: Whether to use distributed processing chunk_duration: Duration of chunks for parallel processing use_intelligent_segmentation: Whether to use intelligent segmentation Returns: Transcription result dictionary """ try: # Validate input file if not os.path.exists(audio_file_path): return { "processing_status": "failed", "error_message": f"Audio file not found: {audio_file_path}" } # Read and encode audio file with open(audio_file_path, "rb") as f: audio_data = f.read() audio_base64 = base64.b64encode(audio_data).decode('utf-8') # Prepare request data request_data = { "audio_file_data": audio_base64, "audio_file_name": os.path.basename(audio_file_path), "model_size": model_size, "language": language, "output_format": output_format, "enable_speaker_diarization": enable_speaker_diarization, "use_parallel_processing": use_parallel_processing, "chunk_duration": chunk_duration, "use_intelligent_segmentation": use_intelligent_segmentation } endpoint_url = self.endpoint_urls["transcribe_audio"] print(f"🎤 Starting transcription via Modal {'function call' if self.use_direct_modal_calls else 'endpoint'}...") print(f" File: {audio_file_path}") print(f" Size: {len(audio_data) / (1024*1024):.2f} MB") print(f" Model: {model_size}") print(f" Parallel processing: {use_parallel_processing}") print(f" Intelligent segmentation: {use_intelligent_segmentation}") print(f" Speaker diarization: {enable_speaker_diarization}") # Choose between direct function call or HTTP endpoint if self.use_direct_modal_calls: # Direct function call (when running inside Modal environment) try: # Call the process_transcription_request method directly result = await 
self.process_transcription_request(request_data) except Exception as e: print(f"⚠️ Direct Modal call failed, falling back to HTTP: {e}") self.use_direct_modal_calls = False # Fall through to HTTP endpoint call else: print(f"✅ Transcription completed successfully via direct function call") self._log_transcription_results(result, enable_speaker_diarization) return result if not self.use_direct_modal_calls: # HTTP endpoint call (fallback) endpoint_url = self.endpoint_urls["transcribe_audio"] async with aiohttp.ClientSession() as session: async with session.post( endpoint_url, json=request_data, timeout=aiohttp.ClientTimeout(total=3600) # 1 hour timeout ) as response: if response.status == 200: result = await response.json() print(f"✅ Transcription completed successfully via HTTP endpoint") self._log_transcription_results(result, enable_speaker_diarization) return result else: error_text = await response.text() return { "processing_status": "failed", "error_message": f"HTTP {response.status}: {error_text}" } except Exception as e: return { "processing_status": "failed", "error_message": f"Transcription request failed: {e}" } async def transcribe_chunk( self, chunk_path: str, start_time: float, end_time: float, model_size: str = "turbo", language: str = None, enable_speaker_diarization: bool = False ) -> Dict[str, Any]: """ Transcribe a single audio chunk using Modal chunk endpoint Args: chunk_path: Path to audio chunk file start_time: Start time of chunk in original audio end_time: End time of chunk in original audio model_size: Whisper model size language: Language code enable_speaker_diarization: Whether to enable speaker diarization Returns: Transcription result for the chunk """ try: # Read and encode chunk file with open(chunk_path, "rb") as f: audio_data = f.read() audio_base64 = base64.b64encode(audio_data).decode('utf-8') # Prepare request data request_data = { "audio_file_data": audio_base64, "audio_file_name": os.path.basename(chunk_path), "model_size": model_size, "language": language, "output_format": "json", # Use JSON for easier merging "enable_speaker_diarization": enable_speaker_diarization, "chunk_start_time": start_time, "chunk_end_time": end_time } # Choose between direct function call or HTTP endpoint if self.use_direct_modal_calls: # Direct function call try: result = self.process_chunk_request(request_data) result["chunk_start_time"] = start_time result["chunk_end_time"] = end_time result["chunk_file"] = chunk_path return result except Exception as e: print(f"⚠️ Direct chunk call failed, falling back to HTTP: {e}") self.use_direct_modal_calls = False # Fall through to HTTP endpoint call if not self.use_direct_modal_calls: # HTTP endpoint call (fallback) endpoint_url = self.endpoint_urls["transcribe_chunk"] # Configure timeout with more granular controls # Adjust timeout based on speaker diarization if enable_speaker_diarization: timeout_config = aiohttp.ClientTimeout( total=720, # 12 minutes total for speaker diarization connect=45, # 45 seconds connection timeout sock_read=300 # 5 minutes read timeout for speaker processing ) else: timeout_config = aiohttp.ClientTimeout( total=480, # 8 minutes total for regular transcription connect=30, # 30 seconds connection timeout sock_read=120 # 2 minutes read timeout for regular processing ) async with aiohttp.ClientSession(timeout=timeout_config) as session: async with session.post( endpoint_url, json=request_data ) as response: if response.status == 200: result = await response.json() result["chunk_start_time"] = start_time 
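    # ------------------------------------------------------------------
    # Hedged sketch, not part of the original API: transcribe_chunk is
    # designed to be fanned out over many chunks, so a client could gather
    # results concurrently as below. The (path, start, end) tuples are
    # assumed to come from an upstream segmentation step.
    # ------------------------------------------------------------------
    async def transcribe_chunks_concurrently(self, chunks: list) -> list:
        """Transcribe (chunk_path, start_time, end_time) tuples in parallel."""
        tasks = [
            self.transcribe_chunk(chunk_path, start, end)
            for chunk_path, start, end in chunks
        ]
        results = await asyncio.gather(*tasks)
        # Order results by their position in the original audio
        return sorted(results, key=lambda r: r.get("chunk_start_time", 0))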
result["chunk_end_time"] = end_time result["chunk_file"] = chunk_path return result else: error_text = await response.text() return { "processing_status": "failed", "error_message": f"HTTP {response.status}: {error_text}", "chunk_start_time": start_time, "chunk_end_time": end_time, "chunk_file": chunk_path } except Exception as e: return { "processing_status": "failed", "error_message": str(e), "chunk_start_time": start_time, "chunk_end_time": end_time, "chunk_file": chunk_path } async def check_endpoints_health(self) -> Dict[str, Any]: """ Check the health status of all Modal endpoints Returns: Health status dictionary for all endpoints """ health_status = {} async with aiohttp.ClientSession() as session: for endpoint_name, endpoint_url in self.endpoint_urls.items(): try: if endpoint_name == "health_check": # Health check endpoint supports GET async with session.get( endpoint_url, timeout=aiohttp.ClientTimeout(total=30) ) as response: if response.status == 200: response_data = await response.json() health_status[endpoint_name] = { "status": "healthy", "response": response_data, "url": endpoint_url } else: health_status[endpoint_name] = { "status": "unhealthy", "error": f"HTTP {response.status}", "url": endpoint_url } else: # Other endpoints are POST-only, just check if they're accessible async with session.get( endpoint_url, timeout=aiohttp.ClientTimeout(total=10) ) as response: # 405 Method Not Allowed is expected for POST-only endpoints if response.status == 405: health_status[endpoint_name] = { "status": "healthy", "response": "Endpoint accessible (POST-only)", "url": endpoint_url } else: health_status[endpoint_name] = { "status": "unknown", "response": f"HTTP {response.status}", "url": endpoint_url } except Exception as e: health_status[endpoint_name] = { "status": "error", "error": str(e), "url": endpoint_url } return health_status async def get_system_status(self) -> Dict[str, Any]: """ Get comprehensive system status including health checks Returns: System status dictionary """ try: endpoint_url = self.endpoint_urls["health_check"] async with aiohttp.ClientSession() as session: async with session.get( endpoint_url, timeout=aiohttp.ClientTimeout(total=30) ) as response: if response.status == 200: return await response.json() else: error_text = await response.text() return { "status": "failed", "error_message": f"HTTP {response.status}: {error_text}" } except Exception as e: return { "status": "failed", "error_message": f"Health check failed: {e}" } def get_endpoint_url(self, endpoint_name: str) -> str: """ Get URL for specific endpoint Args: endpoint_name: Name of the endpoint Returns: Endpoint URL """ return self.endpoint_urls.get(endpoint_name, f"https://richardsucran--{endpoint_name}.modal.run") # ==================== Modal Server-Side Methods ==================== # These methods are used by Modal endpoints running on the server async def process_transcription_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]: """ Process transcription request on Modal server This method runs on the Modal server, not the client """ try: # Import services that are available on Modal server import sys import tempfile import base64 from pathlib import Path # Import local services (available on Modal server) from src.services.distributed_transcription_service import DistributedTranscriptionService from src.services.transcription_service import TranscriptionService # Extract request parameters audio_file_data = request_data.get("audio_file_data") audio_file_name = 
request_data.get("audio_file_name", "audio.mp3") model_size = request_data.get("model_size", "turbo") language = request_data.get("language") output_format = request_data.get("output_format", "srt") enable_speaker_diarization = request_data.get("enable_speaker_diarization", False) use_parallel_processing = request_data.get("use_parallel_processing", True) chunk_duration = request_data.get("chunk_duration", 60) use_intelligent_segmentation = request_data.get("use_intelligent_segmentation", True) if not audio_file_data: return { "processing_status": "failed", "error_message": "No audio data provided" } # Decode audio data and save to temporary file audio_bytes = base64.b64decode(audio_file_data) temp_dir = Path(self.cache_dir) temp_dir.mkdir(exist_ok=True) temp_audio_path = temp_dir / audio_file_name with open(temp_audio_path, "wb") as f: f.write(audio_bytes) print(f"🎤 Processing audio on Modal server: {audio_file_name}") print(f" Size: {len(audio_bytes) / (1024*1024):.2f} MB") print(f" Model: {model_size}") print(f" Parallel processing: {use_parallel_processing}") print(f" Intelligent segmentation: {use_intelligent_segmentation}") # Choose processing strategy based on file size and settings file_size_mb = len(audio_bytes) / (1024 * 1024) if use_parallel_processing and file_size_mb > 10: # Use distributed for files > 10MB print("🔄 Using distributed transcription service") service = DistributedTranscriptionService() result = await service.transcribe_audio_distributed( audio_file_path=str(temp_audio_path), model_size=model_size, language=language, output_format=output_format, enable_speaker_diarization=enable_speaker_diarization, chunk_duration=chunk_duration, use_intelligent_segmentation=use_intelligent_segmentation ) else: print("🎯 Using single transcription service") service = TranscriptionService() result = service.transcribe_audio( audio_file_path=str(temp_audio_path), model_size=model_size, language=language, output_format=output_format, enable_speaker_diarization=enable_speaker_diarization ) # Clean up temporary file try: temp_audio_path.unlink() except: pass print(f"✅ Transcription completed on Modal server") return result except Exception as e: print(f"❌ Error processing transcription request: {e}") return { "processing_status": "failed", "error_message": f"Server processing error: {str(e)}" } def process_chunk_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]: """ Process chunk transcription request on Modal server This method runs on the Modal server, not the client """ try: # Import services that are available on Modal server import base64 import tempfile from pathlib import Path # Import local services (available on Modal server) from src.services.transcription_service import TranscriptionService # Extract request parameters audio_file_data = request_data.get("audio_file_data") audio_file_name = request_data.get("audio_file_name", "chunk.mp3") model_size = request_data.get("model_size", "turbo") language = request_data.get("language") enable_speaker_diarization = request_data.get("enable_speaker_diarization", False) chunk_start_time = request_data.get("chunk_start_time", 0) chunk_end_time = request_data.get("chunk_end_time", 0) if not audio_file_data: return { "processing_status": "failed", "error_message": "No audio data provided", "chunk_start_time": chunk_start_time, "chunk_end_time": chunk_end_time } # Decode audio data and save to temporary file audio_bytes = base64.b64decode(audio_file_data) temp_dir = Path(self.cache_dir) temp_dir.mkdir(exist_ok=True) 
    def process_chunk_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a chunk transcription request on the Modal server.

        This method runs on the Modal server, not the client.
        """
        try:
            # Import the service that is only available on the Modal server
            from src.services.transcription_service import TranscriptionService

            # Extract request parameters
            audio_file_data = request_data.get("audio_file_data")
            audio_file_name = request_data.get("audio_file_name", "chunk.mp3")
            model_size = request_data.get("model_size", "turbo")
            language = request_data.get("language")
            enable_speaker_diarization = request_data.get("enable_speaker_diarization", False)
            chunk_start_time = request_data.get("chunk_start_time", 0)
            chunk_end_time = request_data.get("chunk_end_time", 0)

            if not audio_file_data:
                return {
                    "processing_status": "failed",
                    "error_message": "No audio data provided",
                    "chunk_start_time": chunk_start_time,
                    "chunk_end_time": chunk_end_time,
                }

            # Decode the audio data and save it to a temporary file
            audio_bytes = base64.b64decode(audio_file_data)
            temp_dir = Path(self.cache_dir)
            temp_dir.mkdir(exist_ok=True)
            temp_audio_path = temp_dir / audio_file_name

            with open(temp_audio_path, "wb") as f:
                f.write(audio_bytes)

            print(f"🎤 Processing chunk on Modal server: {audio_file_name}")
            print(f"   Time range: {chunk_start_time:.2f}s - {chunk_end_time:.2f}s")
            print(f"   Size: {len(audio_bytes) / (1024 * 1024):.2f} MB")

            # Chunks always go through the single transcription service
            service = TranscriptionService()
            result = service.transcribe_audio(
                audio_file_path=str(temp_audio_path),
                model_size=model_size,
                language=language,
                output_format="json",  # Always use JSON for chunks
                enable_speaker_diarization=enable_speaker_diarization,
            )

            # Attach chunk timing information
            if result.get("processing_status") == "success":
                result["chunk_start_time"] = chunk_start_time
                result["chunk_end_time"] = chunk_end_time
                result["chunk_file"] = audio_file_name

            # Clean up the temporary file
            try:
                temp_audio_path.unlink()
            except OSError:
                pass

            print("✅ Chunk transcription completed on Modal server")
            return result

        except Exception as e:
            print(f"❌ Error processing chunk request: {e}")
            return {
                "processing_status": "failed",
                "error_message": f"Server chunk processing error: {e}",
                "chunk_start_time": request_data.get("chunk_start_time", 0),
                "chunk_end_time": request_data.get("chunk_end_time", 0),
            }

    def _log_transcription_results(
        self, result: Dict[str, Any], enable_speaker_diarization: bool = False
    ) -> None:
        """
        Log transcription results in a consistent format.

        Args:
            result: Transcription result dictionary
            enable_speaker_diarization: Whether speaker diarization was enabled
        """
        distributed = result.get("distributed_processing", False)
        print(f"   Processing type: {'Distributed' if distributed else 'Single'}")
        print(f"   Segments: {result.get('segment_count', 0)}")
        print(f"   Duration: {result.get('audio_duration', 0):.2f}s")
        print(f"   Language: {result.get('language_detected', 'unknown')}")

        if distributed:
            print(f"   Chunks processed: {result.get('chunks_processed', 0)}")
            print(f"   Chunks failed: {result.get('chunks_failed', 0)}")
            segmentation_type = result.get("segmentation_type", "time_based")
            print(f"   Segmentation: {segmentation_type}")

        if enable_speaker_diarization:
            speaker_count = result.get("global_speaker_count", 0)
            print(f"   Speakers detected: {speaker_count}")
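
# ---------------------------------------------------------------------------
# Minimal client-side usage sketch (an illustration, not part of the original
# module): "example.mp3" is a hypothetical input file, and the endpoint URLs
# configured above must be deployed for the HTTP path to succeed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        # Force HTTP mode since this demo runs outside the Modal environment
        service = ModalTranscriptionService(use_direct_modal_calls=False)

        health = await service.check_endpoints_health()
        for name, status in health.items():
            print(f"{name}: {status['status']}")

        result = await service.transcribe_audio_file(
            audio_file_path="example.mp3",  # hypothetical input file
            model_size="turbo",
            output_format="srt",
        )
        print(f"Status: {result.get('processing_status')}")

    asyncio.run(_demo())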