"""Local Whisper transcriber implementation."""

import asyncio
import pathlib
import time
from typing import List, Optional

import torch
import whisper

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import ModelLoadError, TranscriptionError

class WhisperTranscriber(ITranscriber):
    """Local Whisper transcriber backed by openai-whisper models."""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()
        # Cache loaded models keyed by size so repeated transcriptions
        # do not pay the model-loading cost again.
        self.model_cache = {}
        self.device = self._setup_device()

    def _setup_device(self) -> str:
        """Return the best available device: "cuda" if available, else "cpu"."""
        return "cuda" if torch.cuda.is_available() else "cpu"

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio using a local Whisper model."""
        try:
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path,
                )

            model = self._load_model(model_size)

            # whisper's transcribe() is blocking and compute-bound; run it in a
            # worker thread so this coroutine does not stall the event loop.
            start_time = time.perf_counter()
            result = await asyncio.to_thread(
                model.transcribe,
                str(audio_path),
                language=language,
                verbose=False,
                # fp16 inference is only supported on GPU; disabling it on CPU
                # avoids whisper's "FP16 is not supported on CPU" warning.
                fp16=self.device == "cuda",
            )
            processing_time = time.perf_counter() - start_time

            segments = []
            for seg in result.get("segments", []):
                segments.append(TranscriptionSegment(
                    start=seg["start"],
                    end=seg["end"],
                    text=seg["text"].strip(),
                    # avg_logprob is an average log-probability, not a true
                    # confidence score, but it is the closest proxy whisper emits.
                    confidence=seg.get("avg_logprob"),
                ))

            return TranscriptionResult(
                text=result.get("text", "").strip(),
                segments=segments,
                language=result.get("language", "unknown"),
                model_used=model_size,
                # whisper's result dict has no "duration" key; derive the audio
                # duration from the end of the last segment instead.
                audio_duration=segments[-1].end if segments else 0.0,
                processing_time=processing_time,
                speaker_diarization_enabled=enable_speaker_diarization,
                global_speaker_count=0,
                error_message=None,
            )

        except TranscriptionError:
            # Re-raise our own errors (e.g. missing file) without re-wrapping.
            raise
        except Exception as e:
            raise TranscriptionError(
                f"Whisper transcription failed: {e}",
                model=model_size,
                audio_file=audio_file_path,
            ) from e

    def _load_model(self, model_size: str):
        """Load a Whisper model, caching it for subsequent calls."""
        if model_size not in self.model_cache:
            try:
                print(f"📥 Loading Whisper model: {model_size}")
                self.model_cache[model_size] = whisper.load_model(
                    model_size,
                    device=self.device,
                )
            except Exception as e:
                raise ModelLoadError(
                    f"Failed to load model {model_size}: {e}",
                    model_name=model_size,
                ) from e

        return self.model_cache[model_size]

    def get_supported_models(self) -> List[str]:
        """Get the list of supported model sizes."""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Get the list of supported language codes."""
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]