# ModalTranscriberMCP: src/adapters/modal_adapter.py
"""
Modal transcription adapter for remote processing
"""
import requests
import base64
import pathlib
from typing import List, Optional
from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError
class ModalTranscriptionAdapter(ITranscriber):
"""Adapter for Modal remote transcription processing"""
def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
self.config = config or AudioProcessingConfig()
self.endpoint_url = endpoint_url
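        # endpoint_url may be left unset here; transcribe() raises a
        # TranscriptionError if it has not been configured by call time.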
async def transcribe(
self,
audio_file_path: str,
model_size: str = "turbo",
language: Optional[str] = None,
enable_speaker_diarization: bool = False
) -> TranscriptionResult:
"""Transcribe audio using Modal endpoint"""
if not self.endpoint_url:
raise TranscriptionError(
"Modal endpoint URL not configured",
model=model_size,
audio_file=audio_file_path
)
try:
# Read and encode audio file
audio_path = pathlib.Path(audio_file_path)
if not audio_path.exists():
raise TranscriptionError(
f"Audio file not found: {audio_file_path}",
audio_file=audio_file_path
)
with open(audio_path, 'rb') as f:
audio_data = f.read()
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
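            # Note: base64 encoding inflates the payload by roughly 33%, so large
            # audio files produce correspondingly large JSON request bodies.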
# Prepare request data
request_data = {
"audio_file_data": audio_base64,
"audio_file_name": audio_path.name,
"model_size": model_size,
"language": language,
"output_format": "json",
"enable_speaker_diarization": enable_speaker_diarization
}
print(f"πŸ”„ Sending transcription request to Modal endpoint")
print(f"πŸ“ File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
print(f"πŸ”§ Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")
# Make request to Modal endpoint
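            # requests.post is synchronous, so this call blocks the running event
            # loop for the full duration of the upload and remote transcription.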
response = requests.post(
self.endpoint_url,
json=request_data,
                timeout=1800  # 30-minute timeout
)
response.raise_for_status()
result = response.json()
print(f"βœ… Modal transcription completed")
# Convert result to TranscriptionResult format
return self._convert_modal_result(result)
        except TranscriptionError:
            # Don't re-wrap errors we raised ourselves (e.g. missing audio file)
            raise
        except requests.exceptions.RequestException as e:
            raise TranscriptionError(
                f"Failed to call Modal endpoint: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
        except Exception as e:
            raise TranscriptionError(
                f"Modal transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            ) from e
def get_supported_models(self) -> List[str]:
"""Get list of supported model sizes"""
return list(self.config.whisper_models.keys())
def get_supported_languages(self) -> List[str]:
"""Get list of supported language codes"""
return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
"""Convert Modal result format to TranscriptionResult"""
# Extract segments if available
segments = []
if "segments" in modal_result:
for seg in modal_result["segments"]:
segments.append(TranscriptionSegment(
start=seg.get("start", 0),
end=seg.get("end", 0),
text=seg.get("text", ""),
speaker=seg.get("speaker")
))
return TranscriptionResult(
text=modal_result.get("text", ""),
segments=segments,
language=modal_result.get("language_detected", "unknown"),
model_used=modal_result.get("model_used", "unknown"),
audio_duration=modal_result.get("audio_duration", 0),
processing_time=modal_result.get("processing_time", 0),
speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
global_speaker_count=modal_result.get("global_speaker_count", 0),
error_message=modal_result.get("error_message")
)
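
# Usage sketch (illustrative only; this module is normally imported rather than
# run directly because of the relative imports above). The endpoint URL and
# audio path below are placeholders, not values shipped with this project:
#
#     adapter = ModalTranscriptionAdapter(
#         endpoint_url="https://example--transcribe.modal.run",  # placeholder
#     )
#     result = await adapter.transcribe(
#         "audio/meeting.wav",  # placeholder path
#         model_size="turbo",
#         enable_speaker_diarization=True,
#     )
#     print(result.text)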