File size: 7,894 Bytes
b5df735 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
"""
Health Service
Provides health check functionality for the transcription service
"""
import os
from pathlib import Path
from typing import Any, Dict, Optional

import whisper
class HealthService:
    """Health checks and status reporting for the transcription service.

    Every check returns a plain dictionary containing a ``"status"`` key plus
    diagnostic details. The class attributes below hold the model names and
    cache locations that the checks probe; a subclass may override them.
    """

    # Whisper model that the health check tries to load.
    DEFAULT_WHISPER_MODEL = "turbo"
    # Directory probed for cached Whisper models (also used as download root).
    MODEL_CACHE_DIR = "/model"
    # Directory probed for the cached speaker-diarization model.
    SPEAKER_CACHE_DIR = "/model/speaker-diarization"
    # pyannote pipeline identifier used for speaker diarization.
    SPEAKER_MODEL_NAME = "pyannote/speaker-diarization-3.1"
    # Version string reported by get_health_status().
    SERVICE_VERSION = "1.0.0"
    # Speaker diarization is optional: only an unexpected failure of the check
    # itself ("unhealthy") should make the whole service unhealthy. A missing
    # token ("disabled") is a deliberate configuration, not a fault.
    _ACCEPTABLE_SPEAKER_STATUSES = ("healthy", "partial", "disabled")

    def get_health_status(self) -> Dict[str, Any]:
        """Return the combined health report of the service.

        Returns:
            A dict with the overall ``"status"`` (``"healthy"`` or
            ``"unhealthy"``), a UTC ``"timestamp"``, the ``"whisper"`` and
            ``"speaker_diarization"`` sub-reports, and the ``"version"``.
        """
        whisper_status = self._check_whisper_models()
        speaker_status = self._check_speaker_diarization()

        is_healthy = (
            whisper_status["status"] == "healthy"
            and speaker_status["status"] in self._ACCEPTABLE_SPEAKER_STATUSES
        )

        return {
            "status": "healthy" if is_healthy else "unhealthy",
            "timestamp": self._get_current_timestamp(),
            "whisper": whisper_status,
            "speaker_diarization": speaker_status,
            "version": self.SERVICE_VERSION,
        }

    def _check_whisper_models(self) -> Dict[str, Any]:
        """Check that the default Whisper model can be loaded.

        NOTE: the model is loaded on every call and then discarded, so this
        check is presumably expensive; callers may want to rate-limit it.

        Returns:
            A dict whose ``"status"`` is ``"healthy"`` when the model loaded
            and ``"unhealthy"`` otherwise. Any unexpected exception is
            reported under ``"error"`` instead of being raised.
        """
        default_model = self.DEFAULT_WHISPER_MODEL
        try:
            available_models = whisper.available_models()

            model_cache_dir = self.MODEL_CACHE_DIR
            # Only the directory is probed; its existence does not prove the
            # model file itself is already cached.
            cache_exists = os.path.exists(model_cache_dir)

            try:
                if cache_exists:
                    whisper.load_model(default_model, download_root=model_cache_dir)
                    load_source = "cache"
                else:
                    whisper.load_model(default_model)
                    load_source = "download"
                model_loaded = True
            except Exception as exc:
                model_loaded = False
                load_source = f"failed: {exc}"

            return {
                "status": "healthy" if model_loaded else "unhealthy",
                "default_model": default_model,
                "available_models": available_models,
                "model_cache_exists": cache_exists,
                "model_cache_directory": model_cache_dir if cache_exists else None,
                "model_loaded": model_loaded,
                "load_source": load_source,
                "whisper_version": getattr(whisper, '__version__', 'unknown'),
            }
        except Exception as exc:
            # Health checks must report problems, never raise them.
            return {
                "status": "unhealthy",
                "error": str(exc),
                "default_model": default_model,
                "available_models": [],
                "model_cache_exists": False,
                "model_loaded": False,
            }

    def _load_speaker_pipeline(self, hf_token: str) -> Any:
        """Load the pyannote speaker-diarization pipeline.

        Args:
            hf_token: Hugging Face access token passed to pyannote.

        Returns:
            The loaded pipeline object.

        Raises:
            RuntimeError: If pyannote returned no pipeline.
            Exception: Whatever the import or pyannote itself raises.
        """
        # Imported here rather than at module level so that a missing pyannote
        # install surfaces as an error the callers catch and report.
        from pyannote.audio import Pipeline

        pipeline = Pipeline.from_pretrained(
            self.SPEAKER_MODEL_NAME,
            use_auth_token=hf_token,
        )
        # pyannote may return None instead of raising when the pipeline cannot
        # be fetched (e.g. gated model whose conditions were not accepted).
        if pipeline is None:
            raise RuntimeError(
                f"Pipeline.from_pretrained returned None for {self.SPEAKER_MODEL_NAME}"
            )
        return pipeline

    def _check_speaker_diarization(self) -> Dict[str, Any]:
        """Check whether speaker diarization is usable.

        Returns:
            A dict whose ``"status"`` is ``"healthy"`` (pipeline loaded),
            ``"partial"`` (token present but the pipeline failed to load),
            ``"disabled"`` (no ``HF_TOKEN``), or ``"unhealthy"`` (the check
            itself failed unexpectedly; details under ``"error"``).
        """
        try:
            hf_token = os.environ.get("HF_TOKEN")
            # Truthiness, not ``is not None``: an empty HF_TOKEN is unusable
            # and test_speaker_diarization() treats it as missing as well.
            hf_token_available = bool(hf_token)

            speaker_cache_dir = self.SPEAKER_CACHE_DIR
            cache_exists = os.path.exists(speaker_cache_dir)
            config_exists = os.path.exists(
                os.path.join(speaker_cache_dir, "config.json")
            )

            pipeline_loaded = False
            pipeline_error = None
            if hf_token_available:
                try:
                    self._load_speaker_pipeline(hf_token)
                    pipeline_loaded = True
                except Exception as exc:
                    pipeline_error = str(exc)
            else:
                pipeline_error = "HF_TOKEN not available"

            if pipeline_loaded:
                status = "healthy"
            elif hf_token_available:
                status = "partial"  # Token available but pipeline failed
            else:
                status = "disabled"  # No token, feature disabled

            return {
                "status": status,
                "hf_token_available": hf_token_available,
                "speaker_cache_exists": cache_exists,
                "speaker_cache_directory": speaker_cache_dir if cache_exists else None,
                "config_exists": config_exists,
                "pipeline_loaded": pipeline_loaded,
                "pipeline_error": pipeline_error,
                "model_name": self.SPEAKER_MODEL_NAME,
            }
        except Exception as exc:
            return {
                "status": "unhealthy",
                "error": str(exc),
                "hf_token_available": False,
                "speaker_cache_exists": False,
                "pipeline_loaded": False,
            }

    def test_speaker_diarization(
        self, test_audio_path: Optional[str] = None
    ) -> Dict[str, Any]:
        """Exercise speaker diarization, optionally on a real audio file.

        Args:
            test_audio_path: Path of an audio file to diarize. When omitted,
                only the pipeline load is verified.

        Returns:
            A dict whose ``"status"`` is ``"skipped"`` (no ``HF_TOKEN``),
            ``"pipeline_loaded"`` (no audio given), ``"success"`` (with
            speaker and segment statistics), or ``"failed"`` (missing file or
            any exception, reported under ``"reason"`` / ``"error"``).
        """
        try:
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                return {
                    "status": "skipped",
                    "reason": "HF_TOKEN not available",
                }

            pipeline = self._load_speaker_pipeline(hf_token)

            if not test_audio_path:
                return {
                    "status": "pipeline_loaded",
                    "message": "Speaker diarization pipeline loaded successfully",
                }

            if not os.path.exists(test_audio_path):
                return {
                    "status": "failed",
                    "reason": f"Test audio file not found: {test_audio_path}",
                }

            diarization_result = pipeline(test_audio_path)

            speakers = set()
            segments_count = 0
            total_speech_duration = 0
            for turn, _, speaker in diarization_result.itertracks(yield_label=True):
                speakers.add(speaker)
                segments_count += 1
                total_speech_duration += turn.end - turn.start

            return {
                "status": "success",
                "speakers_detected": len(speakers),
                "segments_count": segments_count,
                "total_speech_duration": total_speech_duration,
                "test_audio_path": test_audio_path,
                # Sorted so the report is deterministic; key=str keeps this
                # safe whatever type the labels have.
                "speakers": sorted(speakers, key=str),
            }
        except Exception as exc:
            return {
                "status": "failed",
                "error": str(exc),
                "test_audio_path": test_audio_path,
            }

    def _get_current_timestamp(self) -> str:
        """Return the current UTC time as an ISO 8601 string ending in ``Z``."""
        from datetime import datetime, timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # time and drop tzinfo so isoformat() emits no "+00:00" before the "Z".
        now_utc = datetime.now(timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat() + "Z"