"""
Speaker diarization implementation using pyannote.audio
"""

import os
from typing import Optional, List, Dict, Any

import torch

from ..interfaces.speaker_detector import ISpeakerDetector
from ..utils.config import AudioProcessingConfig
from ..utils.errors import SpeakerDiarizationError, ModelLoadError


class PyannoteSpeakerDetector(ISpeakerDetector):
    """Speaker diarization using pyannote.audio"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()
        self.device = self._setup_device()
        self.pipeline = None
        # Read the Hugging Face token from the env var named in the config.
        self.auth_token = os.environ.get(self.config.hf_token_env_var)

        if not self.auth_token:
            print("⚠️ No Hugging Face token found. Speaker diarization will be disabled.")

    def _setup_device(self) -> torch.device:
        """Set up and return the best available device"""
        if torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    async def detect_speakers(
        self,
        audio_file_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: int = 1,
        max_speakers: int = 10
    ) -> Dict[str, Any]:
        """Detect speakers in an audio file.

        Returns a dict with "speaker_count", per-speaker stats under
        "speakers", the list of "segments", and the source "audio_file".
        """
        if not self.auth_token:
            raise SpeakerDiarizationError(
                "Speaker diarization requires a Hugging Face token",
                audio_file=audio_file_path
            )

        try:
            # Lazily load the pipeline on first use.
            if self.pipeline is None:
                self.pipeline = self._load_pipeline()

            # Forward the caller's speaker-count hints to the pipeline.
            if num_speakers is not None:
                diarization = self.pipeline(audio_file_path, num_speakers=num_speakers)
            else:
                diarization = self.pipeline(
                    audio_file_path,
                    min_speakers=min_speakers,
                    max_speakers=max_speakers
                )

            speakers = {}
            segments = []

            # itertracks(yield_label=True) yields (segment, track, label) tuples.
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                # Normalize the pipeline's speaker label to a zero-padded SPEAKER_NN id.
                speaker_id = f"SPEAKER_{speaker.split('_')[-1].zfill(2)}"
                segments.append({
                    "start": turn.start,
                    "end": turn.end,
                    "speaker": speaker_id
                })

                if speaker_id not in speakers:
                    speakers[speaker_id] = {
                        "id": speaker_id,
                        "total_time": 0.0,
                        "segments": []
                    }

                speakers[speaker_id]["total_time"] += turn.end - turn.start
                speakers[speaker_id]["segments"].append({
                    "start": turn.start,
                    "end": turn.end
                })

            return {
                "speaker_count": len(speakers),
                "speakers": speakers,
                "segments": segments,
                "audio_file": audio_file_path
            }

        except Exception as e:
            raise SpeakerDiarizationError(
                f"Speaker detection failed: {str(e)}",
                audio_file=audio_file_path
            )

    def _load_pipeline(self):
        """Load pyannote speaker diarization pipeline"""
        try:
            import warnings
            warnings.filterwarnings("ignore", category=UserWarning, module="pyannote")
            warnings.filterwarnings("ignore", category=UserWarning, module="pytorch_lightning")
            warnings.filterwarnings("ignore", category=FutureWarning, module="pytorch_lightning")

            from pyannote.audio import Pipeline

            print("📥 Loading speaker diarization pipeline...")
            pipeline = Pipeline.from_pretrained(
                self.config.speaker_diarization_model,
                use_auth_token=self.auth_token
            )
            pipeline.to(self.device)

            return pipeline

        except Exception as e:
            raise ModelLoadError(
                f"Failed to load speaker diarization pipeline: {str(e)}",
                model_name=self.config.speaker_diarization_model
            )

    def get_supported_models(self) -> List[str]:
        """Get list of supported speaker diarization models"""
        return [self.config.speaker_diarization_model]

    def is_available(self) -> bool:
        """Check if speaker diarization is available"""
        return self.auth_token is not None
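

# Illustrative usage sketch (not part of the class): a minimal example of how this
# detector might be driven from async code. It assumes the environment variable
# named by AudioProcessingConfig.hf_token_env_var holds a valid Hugging Face token
# and that "meeting.wav" exists; both are placeholders, not part of this module.
#
#     import asyncio
#
#     async def main() -> None:
#         detector = PyannoteSpeakerDetector()
#         if not detector.is_available():
#             return
#         result = await detector.detect_speakers(
#             "meeting.wav", min_speakers=2, max_speakers=4
#         )
#         print(f"Found {result['speaker_count']} speakers")
#         for segment in result["segments"]:
#             print(f"{segment['speaker']}: {segment['start']:.1f}s - {segment['end']:.1f}s")
#
#     asyncio.run(main())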