import modal
import os

app = modal.App(name="gradio-mcp-server")
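
# Optional Hugging Face credentials: the pyannote diarization models are gated,
# so a secret named "huggingface-secret" (holding HF_TOKEN) enables them. It can
# be created with the Modal CLI, e.g.:
#   modal secret create huggingface-secret HF_TOKEN=<your-token>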
try:
    hf_secret = modal.Secret.from_name("huggingface-secret")
    print("✅ Found Hugging Face secret configuration")
except Exception:
    hf_secret = None
    print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")

# Persistent volume for cached transcription data.
volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
cache_dir = "/root/cache"


def download_models() -> None:
    """Download and cache Whisper and speaker diarization models."""
    import os
    import whisper
    from pathlib import Path

    model_cache_dir = Path("/model")
    model_cache_dir.mkdir(exist_ok=True)

    print("📥 Downloading Whisper turbo model...")
    whisper.load_model("turbo", download_root="/model")
    print("✅ Whisper turbo model downloaded and cached")

    # The pyannote models are gated on the Hugging Face Hub, so only attempt
    # the download when a token is available.
    if os.environ.get("HF_TOKEN"):
        try:
            print("📥 Downloading speaker diarization models...")
            from pyannote.audio import Pipeline, Model
            from pyannote.audio.core.inference import Inference
            import torch

            os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"

            Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-diarization",
            )
print("📥 Downloading speaker embedding model...") |
|
embedding_model = Model.from_pretrained( |
|
"pyannote/embedding", |
|
use_auth_token=os.environ["HF_TOKEN"], |
|
cache_dir="/model/speaker-embedding" |
|
) |
|
|
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
embedding_model.to(device) |
|
embedding_model.eval() |
|
|
|
|
|
inference = Inference(embedding_model, window="whole") |
print("🧪 Testing speaker diarization pipeline...") |
|
|
|
|
|
import json |
|
speaker_dir = Path("/model/speaker-diarization") |
|
speaker_dir.mkdir(exist_ok=True, parents=True) |
|
|
|
embedding_dir = Path("/model/speaker-embedding") |
|
embedding_dir.mkdir(exist_ok=True, parents=True) |
|
|
|
config = { |
|
"model_name": "pyannote/speaker-diarization-3.1", |
|
"embedding_model_name": "pyannote/embedding", |
|
"cached_at": str(speaker_dir), |
|
"embedding_cached_at": str(embedding_dir), |
|
"cache_complete": True, |
|
"embedding_cache_complete": True, |
|
"pyannote_cache_env": "/model/speaker-diarization", |
|
"device": str(device) |
|
} |
|
with open(speaker_dir / "download_complete.json", "w") as f: |
|
json.dump(config, f) |
|
|
|
print("✅ Speaker diarization and embedding models downloaded and cached") |
|
except Exception as e: |
|
print(f"⚠️ Failed to download speaker diarization models: {e}") |
|
else: |
|
print("⚠️ No HF_TOKEN found, skipping speaker diarization model download") |


# Container image: system packages (ffmpeg for audio, plus the shared libraries
# headless Chrome needs for Selenium scraping), then Google Chrome itself.
image = modal.Image.debian_slim(python_version="3.11").apt_install(
    "ffmpeg",
    "wget",
    "curl",
    "unzip",
    "gnupg2",
    "git",
    # Headless Chrome runtime dependencies
    "libglib2.0-0",
    "libnss3",
    "libatk-bridge2.0-0",
    "libdrm2",
    "libxkbcommon0",
    "libxcomposite1",
    "libxdamage1",
    "libxrandr2",
    "libgbm1",
    "libxss1",
    "libasound2",
).run_commands(
    # Install Google Chrome from Google's .deb package
    "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
    "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
    "rm google-chrome-stable_current_amd64.deb",
).pip_install(
    # Web UI / API
    "gradio>=5.31.0",
    "fastapi",
    "pydantic",
    "python-dotenv",
    # MCP server
    "mcp[cli]",
    "fastmcp>=2.7.0",
    "starlette",
    # Web scraping
    "beautifulsoup4",
    "selenium",
    "requests",
    # Transcription
    "git+https://github.com/openai/whisper.git",
    "ffmpeg-python",
    "torchaudio==2.1.0",
    "numpy<2",
    # Audio processing
    "librosa",
    "soundfile",
    # Utilities
    "dacite",
    "jiwer",
    "pandas",
    "loguru==0.6.0",
    "gql[all]~=3.0.0a5",
    # Speaker diarization
    "pyannote.audio==3.1.0",
    "psutil",
).run_function(
    # Run at build time so the model weights are baked into the image.
    download_models,
    secrets=[hf_secret] if hf_secret else [],
)

# Mount the application source and reuse the secret list for the endpoints below.
image = image.add_local_dir("../src", remote_path="/root/src")
secrets = [hf_secret] if hf_secret else []


@app.function(
    image=image,
    volumes={cache_dir: volume},
    cpu=4,
    memory=8192,
    gpu="A10G",
    timeout=1800,
    scaledown_window=40,
    secrets=secrets,
)
@modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
def transcribe_audio_chunk_endpoint(request_data: dict):
    """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)."""
    import sys

    sys.path.append("/root")

    from src.services.modal_transcription_service import ModalTranscriptionService

    modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
    return modal_service.process_chunk_request(request_data)
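
# Minimal client sketch (illustrative only: the real URL is assigned by Modal at
# deploy time, and the payload schema is whatever
# ModalTranscriptionService.process_chunk_request expects):
#
#   import requests
#   resp = requests.post(
#       "https://<workspace>--transcribe-audio-chunk-endpoint.modal.run",
#       json={...},  # chunk payload
#   )
#   print(resp.json())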


@app.function(
    image=image,
    cpu=2,
    memory=2048,
    timeout=300,
    scaledown_window=600,
    secrets=secrets,
)
@modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
def health_check_endpoint():
    """Health check endpoint to verify service status."""
    import sys

    sys.path.append("/root")

    from src.services.health_service import HealthService

    health_service = HealthService()
    return health_service.get_health_status()
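
# Deploy with the Modal CLI (substitute this file's actual name):
#   modal deploy <this_file>.py
# The health check can then be exercised with a plain GET, e.g.:
#   curl https://<workspace>--health-check-endpoint.modal.run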