""" Transcription tools using the enhanced service architecture Updated to use ModalTranscriptionService for better separation of concerns """ import asyncio from typing import Dict, Any from ..services import ModalTranscriptionService # Global service instance for reuse _modal_transcription_service = None def _format_srt_time(seconds: float) -> str: """Format seconds to SRT time format (HH:MM:SS,mmm)""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) millisecs = int((seconds % 1) * 1000) return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}" def get_modal_transcription_service() -> ModalTranscriptionService: """Get or create global ModalTranscriptionService instance""" global _modal_transcription_service if _modal_transcription_service is None: _modal_transcription_service = ModalTranscriptionService(use_direct_modal_calls=True) return _modal_transcription_service async def transcribe_audio_file_tool( audio_file_path: str, model_size: str = "turbo", # Default to turbo model language: str = None, output_format: str = "srt", enable_speaker_diarization: bool = False, use_parallel_processing: bool = True, # Enable parallel processing by default chunk_duration: int = 60, # 60 seconds chunks for parallel processing use_intelligent_segmentation: bool = True # Enable intelligent segmentation by default ) -> Dict[str, Any]: """ MCP tool function for audio transcription using Modal endpoints with intelligent processing Enhanced to save transcription results to local files Args: audio_file_path: Path to audio file model_size: Whisper model size (tiny, base, small, medium, large, turbo) language: Language code (e.g., 'en', 'zh', None for auto-detect) output_format: Output format (srt, txt, json) enable_speaker_diarization: Whether to enable speaker diarization use_parallel_processing: Whether to use distributed processing for long audio chunk_duration: Duration of each chunk in seconds for parallel processing use_intelligent_segmentation: Whether to use intelligent silence-based segmentation Returns: Transcription result dictionary with local file paths """ try: import os import pathlib service = get_modal_transcription_service() modal_result = await service.transcribe_audio_file( audio_file_path=audio_file_path, model_size=model_size, language=language, output_format=output_format, enable_speaker_diarization=enable_speaker_diarization, use_parallel_processing=use_parallel_processing, chunk_duration=chunk_duration, use_intelligent_segmentation=use_intelligent_segmentation ) # Check if transcription was successful if modal_result.get("processing_status") != "success": return modal_result # Debug: Print modal result structure print(f"🔍 Modal result keys: {list(modal_result.keys())}") print(f"🔍 Has text: {bool(modal_result.get('text'))}") print(f"🔍 Has segments: {bool(modal_result.get('segments'))}") if modal_result.get("segments"): print(f"🔍 Segments count: {len(modal_result['segments'])}") # Save transcription results to local files using storage config from ..utils.storage_config import get_storage_config storage_config = get_storage_config() base_name = pathlib.Path(audio_file_path).stem output_dir = storage_config.transcripts_dir saved_files = [] txt_file_path = None srt_file_path = None json_file_path = None # Generate SRT content if segments are available if modal_result.get("segments"): segments = modal_result["segments"] srt_content = "" for i, segment in enumerate(segments, 1): start_time = _format_srt_time(segment.get("start", 0)) end_time = 
_format_srt_time(segment.get("end", 0)) text = segment.get("text", "").strip() if text: if enable_speaker_diarization and segment.get("speaker"): text = f"[{segment['speaker']}] {text}" srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n" if srt_content: srt_file_path = output_dir / f"{base_name}.srt" with open(srt_file_path, 'w', encoding='utf-8') as f: f.write(srt_content) saved_files.append(str(srt_file_path)) print(f"💾 Saved SRT file: {srt_file_path}") # Generate TXT content if text is available if modal_result.get("text"): txt_file_path = output_dir / f"{base_name}.txt" with open(txt_file_path, 'w', encoding='utf-8') as f: f.write(modal_result["text"]) saved_files.append(str(txt_file_path)) print(f"💾 Saved TXT file: {txt_file_path}") # Save JSON file with full results (always save for debugging) import json json_file_path = output_dir / f"{base_name}.json" with open(json_file_path, 'w', encoding='utf-8') as f: json.dump(modal_result, f, indent=2, ensure_ascii=False) saved_files.append(str(json_file_path)) print(f"💾 Saved JSON file: {json_file_path}") # Warn if no text/segments found if not modal_result.get("segments") and not modal_result.get("text"): print("⚠️ Warning: No text or segments found in transcription result") # Update result with local file paths result = modal_result.copy() result["txt_file_path"] = str(txt_file_path) if txt_file_path else None result["srt_file_path"] = str(srt_file_path) if srt_file_path else None result["json_file_path"] = str(json_file_path) if json_file_path else None result["saved_files"] = saved_files result["local_files_saved"] = len(saved_files) print(f"✅ Transcription completed and saved {len(saved_files)} local files") return result except Exception as e: return { "processing_status": "failed", "error_message": f"Tool error: {str(e)}" } async def check_modal_endpoints_health() -> Dict[str, Any]: """ Check the health status of Modal endpoints Returns: Health status dictionary for all endpoints """ try: service = get_modal_transcription_service() return await service.check_endpoints_health() except Exception as e: return { "status": "failed", "error_message": f"Health check tool error: {str(e)}" } async def get_system_status() -> Dict[str, Any]: """ Get comprehensive system status including health checks Returns: System status dictionary """ try: service = get_modal_transcription_service() return await service.get_system_status() except Exception as e: return { "status": "failed", "error_message": f"System status tool error: {str(e)}" } def get_modal_endpoint_url(endpoint_label: str) -> str: """ Get Modal endpoint URL for given label Args: endpoint_label: Modal endpoint label Returns: Full endpoint URL """ try: service = get_modal_transcription_service() return service.get_endpoint_url(endpoint_label) except Exception as e: # Fallback to default URL pattern return f"https://richardsucran--{endpoint_label}.modal.run" # Note: Download functionality has been moved to download_tools.py # These functions are now implemented there using PodcastDownloadService for local downloads
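

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the MCP tool surface).
# Assumptions: this module is importable as part of its package (the relative
# imports above mean it must be run via `python -m <package>.tools.<module>`,
# where <package> and <module> are placeholders for the actual names), and the
# Modal endpoints used by ModalTranscriptionService are already deployed.
if __name__ == "__main__":
    import sys

    async def _demo(audio_path: str) -> None:
        # Transcribe a local audio file and report where results were written.
        result = await transcribe_audio_file_tool(
            audio_file_path=audio_path,
            model_size="turbo",
            output_format="srt",
        )
        print(f"Status: {result.get('processing_status')}")
        print(f"Saved files: {result.get('saved_files')}")

    if len(sys.argv) > 1:
        asyncio.run(_demo(sys.argv[1]))
    else:
        print("Usage: python -m <package>.tools.<module> <audio_file_path>")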