"""
Health Service

Provides health check functionality for the transcription service.
"""
|
|
|
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

import whisper
|
|
|
|
|
class HealthService:
    """Service for health checks and status monitoring.

    Reports on two components: the Whisper transcription model and the
    pyannote speaker-diarization pipeline. All check methods return plain
    dictionaries and catch their own errors, so a failing component is
    reported in the payload instead of raising to the caller.
    """

    # Version string reported in the health payload.
    SERVICE_VERSION = "1.0.0"

    # Whisper model that is load-tested, and the directory probed as its cache.
    DEFAULT_WHISPER_MODEL = "turbo"
    WHISPER_CACHE_DIR = "/model"

    # Diarization pipeline identifier and the directory probed as its cache.
    SPEAKER_MODEL_NAME = "pyannote/speaker-diarization-3.1"
    SPEAKER_CACHE_DIR = "/model/speaker-diarization"

    def get_health_status(self) -> Dict[str, Any]:
        """Get comprehensive health status of the service.

        Returns:
            A dict with the keys ``status`` ("healthy" or "unhealthy"),
            ``timestamp``, ``whisper``, ``speaker_diarization`` and
            ``version``.
        """
        whisper_status = self._check_whisper_models()
        speaker_status = self._check_speaker_diarization()

        # NOTE(review): a "disabled" diarization status (no HF_TOKEN) makes the
        # whole service "unhealthy", while "partial" (token present but the
        # pipeline failed to load) does not. Behaviour kept as-is; confirm
        # that this asymmetry is intended.
        is_healthy = (
            whisper_status["status"] == "healthy"
            and speaker_status["status"] in ("healthy", "partial")
        )

        return {
            "status": "healthy" if is_healthy else "unhealthy",
            "timestamp": self._get_current_timestamp(),
            "whisper": whisper_status,
            "speaker_diarization": speaker_status,
            "version": self.SERVICE_VERSION,
        }

    def _check_whisper_models(self) -> Dict[str, Any]:
        """Check Whisper model availability.

        Lists the models known to the ``whisper`` package and tries to load
        the default model, using ``WHISPER_CACHE_DIR`` as download root when
        that directory exists.

        Returns:
            A status dict. ``status`` is "healthy" only when the default
            model could be loaded. Any unexpected error yields a reduced
            "unhealthy" payload that carries an ``error`` message.
        """
        default_model = self.DEFAULT_WHISPER_MODEL
        model_cache_dir = self.WHISPER_CACHE_DIR

        try:
            available_models = whisper.available_models()
            cache_exists = os.path.exists(model_cache_dir)

            # NOTE(review): this loads the full model on every health check,
            # which is expensive; consider caching the result if the endpoint
            # is polled frequently.
            try:
                # The loaded model is deliberately not kept: only the fact
                # that loading succeeded matters here.
                if cache_exists:
                    whisper.load_model(
                        default_model, download_root=model_cache_dir
                    )
                    load_source = "cache"
                else:
                    whisper.load_model(default_model)
                    load_source = "download"
                model_loaded = True
            except Exception as e:
                model_loaded = False
                load_source = f"failed: {e}"

            return {
                "status": "healthy" if model_loaded else "unhealthy",
                "default_model": default_model,
                "available_models": available_models,
                "model_cache_exists": cache_exists,
                "model_cache_directory": (
                    model_cache_dir if cache_exists else None
                ),
                "model_loaded": model_loaded,
                "load_source": load_source,
                "whisper_version": getattr(whisper, '__version__', 'unknown'),
            }

        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
                "default_model": default_model,
                "available_models": [],
                "model_cache_exists": False,
                "model_loaded": False,
            }

    def _load_diarization_pipeline(self, hf_token: str) -> Any:
        """Load the speaker-diarization pipeline or raise.

        Args:
            hf_token: Hugging Face access token passed to pyannote.

        Returns:
            The loaded pyannote pipeline object.

        Raises:
            RuntimeError: If ``from_pretrained`` returns ``None``.
            Exception: Whatever the import or ``from_pretrained`` raises.
        """
        # Imported here rather than at module level so that a missing
        # pyannote install surfaces as a reported error from the callers'
        # try/except instead of breaking the import of this module.
        from pyannote.audio import Pipeline

        pipeline = Pipeline.from_pretrained(
            self.SPEAKER_MODEL_NAME,
            use_auth_token=hf_token,
        )
        # from_pretrained may return None (rather than raise) when the gated
        # checkpoint cannot be fetched; treat that as a load failure so it is
        # never reported as a loaded pipeline.
        if pipeline is None:
            raise RuntimeError(
                f"Could not load pipeline '{self.SPEAKER_MODEL_NAME}' "
                "(from_pretrained returned None; check HF_TOKEN access)"
            )
        return pipeline

    def _check_speaker_diarization(self) -> Dict[str, Any]:
        """Check speaker diarization functionality.

        Reads ``HF_TOKEN`` from the environment, probes the local cache
        directory and its ``config.json`` (reported for information only),
        and tries to load the pipeline when a token is present.

        Returns:
            A status dict whose ``status`` is "healthy" (pipeline loaded),
            "partial" (token present but loading failed), "disabled" (no
            token) or "unhealthy" (unexpected error).
        """
        try:
            hf_token = os.environ.get("HF_TOKEN")
            # An empty HF_TOKEN is treated as missing, consistent with
            # test_speaker_diarization().
            hf_token_available = bool(hf_token)

            speaker_cache_dir = self.SPEAKER_CACHE_DIR
            cache_exists = os.path.exists(speaker_cache_dir)
            config_exists = os.path.exists(
                os.path.join(speaker_cache_dir, "config.json")
            )

            pipeline_loaded = False
            pipeline_error = None

            if hf_token_available:
                try:
                    self._load_diarization_pipeline(hf_token)
                    pipeline_loaded = True
                except Exception as e:
                    pipeline_error = str(e)
            else:
                pipeline_error = "HF_TOKEN not available"

            if pipeline_loaded:
                status = "healthy"
            elif hf_token_available:
                status = "partial"
            else:
                status = "disabled"

            return {
                "status": status,
                "hf_token_available": hf_token_available,
                "speaker_cache_exists": cache_exists,
                "speaker_cache_directory": (
                    speaker_cache_dir if cache_exists else None
                ),
                "config_exists": config_exists,
                "pipeline_loaded": pipeline_loaded,
                "pipeline_error": pipeline_error,
                "model_name": self.SPEAKER_MODEL_NAME,
            }

        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
                "hf_token_available": False,
                "speaker_cache_exists": False,
                "pipeline_loaded": False,
            }

    def test_speaker_diarization(
        self, test_audio_path: Optional[str] = None
    ) -> Dict[str, Any]:
        """Test speaker diarization functionality with actual audio.

        Args:
            test_audio_path: Optional path to an audio file. When omitted,
                only pipeline loading is tested.

        Returns:
            A dict whose ``status`` is one of:
            "skipped" (no HF_TOKEN), "pipeline_loaded" (no audio given and
            the pipeline loaded), "success" (audio processed; includes
            speaker and segment statistics) or "failed".
        """
        try:
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                return {
                    "status": "skipped",
                    "reason": "HF_TOKEN not available",
                }

            # Validate the cheap precondition before the expensive pipeline
            # load, so a bad path fails fast.
            if test_audio_path and not os.path.exists(test_audio_path):
                return {
                    "status": "failed",
                    "reason": f"Test audio file not found: {test_audio_path}",
                }

            pipeline = self._load_diarization_pipeline(hf_token)

            if not test_audio_path:
                return {
                    "status": "pipeline_loaded",
                    "message": "Speaker diarization pipeline loaded successfully",
                }

            diarization_result = pipeline(test_audio_path)

            speakers = set()
            segments_count = 0
            # Sum of per-turn durations as yielded by itertracks().
            total_speech_duration = 0

            for turn, _, speaker in diarization_result.itertracks(yield_label=True):
                speakers.add(speaker)
                segments_count += 1
                total_speech_duration += turn.end - turn.start

            return {
                "status": "success",
                "speakers_detected": len(speakers),
                "segments_count": segments_count,
                "total_speech_duration": total_speech_duration,
                "test_audio_path": test_audio_path,
                # Sorted so the payload is deterministic across runs.
                "speakers": sorted(speakers, key=str),
            }

        except Exception as e:
            return {
                "status": "failed",
                "error": str(e),
                "test_audio_path": test_audio_path,
            }

    def _get_current_timestamp(self) -> str:
        """Get current UTC timestamp in ISO format with a trailing ``Z``."""
        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so isoformat() emits no "+00:00" before the "Z" suffix.
        now_utc = datetime.now(timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat() + "Z"