import modal
import os

# Create Modal application
app = modal.App(name="gradio-mcp-server")

# Try to get Hugging Face token from Modal secrets (required for speaker diarization)
try:
    hf_secret = modal.Secret.from_name("huggingface-secret")
    print("✅ Found Hugging Face secret configuration")
except Exception:
    hf_secret = None
    print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")

# Create mounted volume
volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
cache_dir = "/root/cache"


# Model preloading function
def download_models() -> None:
    """Download and cache Whisper and speaker diarization models."""
    import whisper
    from pathlib import Path

    # Create model cache directory
    model_cache_dir = Path("/model")
    model_cache_dir.mkdir(exist_ok=True)

    print("📥 Downloading Whisper turbo model...")
    # Download and cache Whisper turbo model
    whisper_model = whisper.load_model("turbo", download_root="/model")
    print("✅ Whisper turbo model downloaded and cached")

    # Download speaker diarization models if HF token is available
    if os.environ.get("HF_TOKEN"):
        try:
            print("📥 Downloading speaker diarization models...")
            import torch
            from pyannote.audio import Model, Pipeline
            from pyannote.audio.core.inference import Inference

            # Set proper cache directory for pyannote
            os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"

            # Download and cache the speaker diarization pipeline;
            # it is cached automatically under the PYANNOTE_CACHE directory
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-diarization",
            )

            # Preload speaker embedding model for speaker identification
            print("📥 Downloading speaker embedding model...")
            embedding_model = Model.from_pretrained(
                "pyannote/embedding",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-embedding",
            )

            # Set device for models
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            embedding_model.to(device)
            embedding_model.eval()

            # Create inference object for embedding extraction
            inference = Inference(embedding_model, window="whole")

            # Reaching this point means both models loaded successfully
            print("🧪 Testing speaker diarization pipeline...")

            # Create a simple marker file to indicate successful download
            import json

            speaker_dir = Path("/model/speaker-diarization")
            speaker_dir.mkdir(exist_ok=True, parents=True)
            embedding_dir = Path("/model/speaker-embedding")
            embedding_dir.mkdir(exist_ok=True, parents=True)
            config = {
                "model_name": "pyannote/speaker-diarization-3.1",
                "embedding_model_name": "pyannote/embedding",
                "cached_at": str(speaker_dir),
                "embedding_cached_at": str(embedding_dir),
                "cache_complete": True,
                "embedding_cache_complete": True,
                "pyannote_cache_env": "/model/speaker-diarization",
                "device": str(device),
            }
            with open(speaker_dir / "download_complete.json", "w") as f:
                json.dump(config, f)

            print("✅ Speaker diarization and embedding models downloaded and cached")
        except Exception as e:
            print(f"⚠️ Failed to download speaker diarization models: {e}")
    else:
        print("⚠️ No HF_TOKEN found, skipping speaker diarization model download")
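
# --- Illustration only: how runtime code could reuse the caches baked in by
# download_models() above. This helper is a sketch and is not referenced by
# the endpoints below (they load models inside src/services/*); the model
# names and paths simply mirror the build step.
def load_cached_models():
    """Load Whisper, and the diarization pipeline if available, from /model."""
    import whisper

    # download_root="/model" points Whisper at the weights cached at build time
    model = whisper.load_model("turbo", download_root="/model")

    pipeline = None
    if os.environ.get("HF_TOKEN"):
        from pyannote.audio import Pipeline

        # cache_dir matches the build-time cache, so no re-download is needed
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ["HF_TOKEN"],
            cache_dir="/model/speaker-diarization",
        )
    return model, pipeline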

# Create image environment with model preloading
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install(
        # Basic tools
        "ffmpeg",
        "wget",
        "curl",
        "unzip",
        "gnupg2",
        "git",  # Required to pip-install Whisper from GitHub
        # Chrome dependencies
        "libglib2.0-0",
        "libnss3",
        "libatk-bridge2.0-0",
        "libdrm2",
        "libxkbcommon0",
        "libxcomposite1",
        "libxdamage1",
        "libxrandr2",
        "libgbm1",
        "libxss1",
        "libasound2",
    )
    .run_commands(
        # Download and install Chrome directly (faster method)
        "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
        "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
        "rm google-chrome-stable_current_amd64.deb",
    )
    .pip_install(
        # Web frameworks and basic libraries
        "gradio>=5.31.0",
        "fastapi",
        "pydantic",
        "python-dotenv",
        # MCP related
        "mcp[cli]",
        "fastmcp>=2.7.0",
        "starlette",
        # Network and parsing
        "beautifulsoup4",
        "selenium",
        "requests",
        # Whisper and audio processing
        "git+https://github.com/openai/whisper.git",
        "ffmpeg-python",
        "torchaudio==2.1.0",
        "numpy<2",
        # Audio processing dependencies
        "librosa",
        "soundfile",
        # Other Whisper ecosystem dependencies
        "dacite",
        "jiwer",
        "pandas",
        "loguru==0.6.0",
        # GraphQL client (if needed)
        "gql[all]~=3.0.0a5",
        # Speaker diarization
        "pyannote.audio==3.1.0",
        # System monitoring
        "psutil",
    )
    .run_function(download_models, secrets=[hf_secret] if hf_secret else [])
)

# Mount the local source tree into the image
image = image.add_local_dir("../src", remote_path="/root/src")

secrets = [hf_secret] if hf_secret else []

# ==================== Modal Endpoints Configuration ====================


@app.function(
    image=image,
    volumes={cache_dir: volume},
    cpu=4,  # extra CPU for better transcription throughput
    memory=8192,  # 8GB memory for stable transcription
    gpu="A10G",
    timeout=1800,  # 30-minute timeout to accommodate speaker diarization
    scaledown_window=40,  # scale down after 40 seconds of inactivity
    secrets=secrets,
)
@modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
def transcribe_audio_chunk_endpoint(request_data: dict):
    """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)."""
    import sys

    sys.path.append("/root")

    from src.services.modal_transcription_service import ModalTranscriptionService

    modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
    return modal_service.process_chunk_request(request_data)


@app.function(
    image=image,
    cpu=2,  # modest CPU is enough for health checks
    memory=2048,  # 2GB memory for stability
    timeout=300,  # 5-minute timeout for health checks
    scaledown_window=600,  # scale down after 10 minutes of inactivity
    secrets=secrets,
)
@modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
def health_check_endpoint():
    """Health check endpoint to verify service status."""
    import sys

    sys.path.append("/root")

    from src.services.health_service import HealthService

    health_service = HealthService()
    return health_service.get_health_status()
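
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not required by the deployment). After
#
#     modal deploy deploy_modal.py        # filename assumed
#
# Modal serves each labeled fastapi_endpoint at a URL of the form
# https://<workspace>--<label>.modal.run, so the endpoints can be exercised
# with plain HTTP. The POST body below is a placeholder; the real schema is
# whatever ModalTranscriptionService.process_chunk_request expects.
#
#     curl https://<workspace>--health-check-endpoint.modal.run
#     curl -X POST https://<workspace>--transcribe-audio-chunk-endpoint.modal.run \
#          -H "Content-Type: application/json" \
#          -d '{...}'
# ---------------------------------------------------------------------------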