import modal
import os

# Create Modal application
app = modal.App(name="gradio-mcp-server")

# Try to get Hugging Face token from Modal secrets (required for speaker diarization)
try:
    hf_secret = modal.Secret.from_name("huggingface-secret")
    print("✅ Found Hugging Face secret configuration")
except Exception:
    hf_secret = None
    print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")

# Create a persistent volume, mounted at cache_dir by the functions below
volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
cache_dir = "/root/cache"

# Model preloading function
def download_models() -> None:
    """Download and cache Whisper and speaker diarization models"""
    import whisper
    import os
    from pathlib import Path
    
    # Create model cache directory
    model_cache_dir = Path("/model")
    model_cache_dir.mkdir(exist_ok=True)
    
    print("📥 Downloading Whisper turbo model...")
    # Download and cache Whisper turbo model
    whisper_model = whisper.load_model("turbo", download_root="/model")
    print("✅ Whisper turbo model downloaded and cached")
    
    # Download speaker diarization models if HF token is available
    if os.environ.get("HF_TOKEN"):
        try:
            print("📥 Downloading speaker diarization models...")
            from pyannote.audio import Pipeline, Model, Inference
            import torch
            
            # Set proper cache directory for pyannote
            os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"
            
            # Download and cache speaker diarization pipeline
            # This will automatically cache to the PYANNOTE_CACHE directory
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-diarization"
            )
            
            # Preload speaker embedding model for speaker identification
            print("📥 Downloading speaker embedding model...")
            embedding_model = Model.from_pretrained(
                "pyannote/embedding",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-embedding"
            )
            
            # Set device for models
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            embedding_model.to(device)
            embedding_model.eval()
            
            # Create inference object for embedding extraction
            inference = Inference(embedding_model, window="whole")
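            # window="whole" makes Inference return a single embedding vector
            # per input rather than a sliding-window sequence, which is the
            # representation speaker identification needs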
            
            # Reaching this point means both models instantiated successfully,
            # which is itself the verification that the downloads are usable
            print("🧪 Speaker diarization pipeline loaded successfully")
            
            # Create a simple marker file to indicate successful download
            import json
            speaker_dir = Path("/model/speaker-diarization")
            speaker_dir.mkdir(exist_ok=True, parents=True)
            
            embedding_dir = Path("/model/speaker-embedding")
            embedding_dir.mkdir(exist_ok=True, parents=True)
            
            config = {
                "model_name": "pyannote/speaker-diarization-3.1",
                "embedding_model_name": "pyannote/embedding",
                "cached_at": str(speaker_dir),
                "embedding_cached_at": str(embedding_dir),
                "cache_complete": True,
                "embedding_cache_complete": True,
                "pyannote_cache_env": "/model/speaker-diarization",
                "device": str(device)
            }
            with open(speaker_dir / "download_complete.json", "w") as f:
                json.dump(config, f)
            
            print("✅ Speaker diarization and embedding models downloaded and cached")
        except Exception as e:
            print(f"⚠️ Failed to download speaker diarization models: {e}")
    else:
        print("⚠️ No HF_TOKEN found, skipping speaker diarization model download")

# Create image environment with model preloading
image = modal.Image.debian_slim(python_version="3.11").apt_install(
    # Basic tools
    "ffmpeg",
    "wget",
    "curl",
    "unzip",
    "gnupg2",
    "git",  # Required by Whisper
    # Chrome dependencies
    "libglib2.0-0",
    "libnss3",
    "libatk-bridge2.0-0",
    "libdrm2",
    "libxkbcommon0",
    "libxcomposite1",
    "libxdamage1",
    "libxrandr2",
    "libgbm1",
    "libxss1",
    "libasound2"
).run_commands(
    # Download and install Chrome directly (faster method)
    "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
    "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
    "rm google-chrome-stable_current_amd64.deb"
).pip_install(
    # Web frameworks and basic libraries
    "gradio>=5.31.0",
    "fastapi",
    "pydantic", 
    "python-dotenv",
    # MCP related
    "mcp[cli]",
    "fastmcp>=2.7.0",
    "starlette",
    # Network and parsing
    "beautifulsoup4",
    "selenium",
    "requests",
    # Whisper and audio processing related
    "git+https://github.com/openai/whisper.git",
    "ffmpeg-python",
    "torchaudio==2.1.0",
    "numpy<2",
    # Audio processing dependencies
    "librosa",
    "soundfile",
    # Other Whisper ecosystem dependencies
    "dacite",
    "jiwer",
    "pandas",
    "loguru==0.6.0",
    # GraphQL client (if needed)
    "gql[all]~=3.0.0a5",
    # Speaker diarization related dependencies
    "pyannote.audio==3.1.0",
    # System monitoring
    "psutil",
).run_function(
    download_models, 
    secrets=[hf_secret] if hf_secret else []
)
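# Note: run_function executes download_models during the image *build*, so the
# downloaded weights are committed into the image snapshot instead of being
# fetched again on every cold start.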

# Mount the local source tree into the image
image = image.add_local_dir("../src", remote_path="/root/src")
secrets = [hf_secret] if hf_secret else []

# ==================== Modal Endpoints Configuration ====================

@app.function(
    image=image,
    volumes={cache_dir: volume},
    cpu=4,  # 4 CPUs for audio preprocessing alongside the GPU
    memory=8192,  # 8GB memory for stable transcription
    gpu="A10G",
    timeout=1800,  # 30 minutes timeout for speaker diarization support
    scaledown_window=40,  # scale down after 40 seconds idle (value is in seconds)
    secrets=secrets,
)
@modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
def transcribe_audio_chunk_endpoint(request_data: dict):
    """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)"""
    import sys
    sys.path.append('/root')
    
    from src.services.modal_transcription_service import ModalTranscriptionService
    
    modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
    return modal_service.process_chunk_request(request_data)
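
# Example request once deployed (the URL follows Modal's
# "https://<workspace>--<label>.modal.run" convention; the JSON fields shown
# are illustrative -- the real schema is defined by
# ModalTranscriptionService.process_chunk_request):
#   curl -X POST https://<workspace>--transcribe-audio-chunk-endpoint.modal.run \
#        -H "Content-Type: application/json" \
#        -d '{"audio_url": "...", "chunk_index": 0}'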

@app.function(
    image=image,
    cpu=2,  # lightweight endpoint; 2 CPUs is ample for health checks
    memory=2048,  # 2GB memory for stability
    timeout=300,  # 5 minutes timeout for health checks
    scaledown_window=600,  # 10 minutes before scaling down
    secrets=secrets,
)
@modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
def health_check_endpoint():
    """Health check endpoint to verify service status"""
    import sys
    sys.path.append('/root')
    
    from src.services.health_service import HealthService
    
    health_service = HealthService()
    return health_service.get_health_status()
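
# Deploy with the Modal CLI (substitute the actual filename of this script):
#   modal deploy <this-file>.py
# Then verify the service is up:
#   curl https://<workspace>--health-check-endpoint.modal.run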