Spaces:

Agents-MCP-Hackathon
/

ModalTranscriberMCP

Running

File size: 2,923 Bytes

b5df735

"""
Speaker identification and embedding management interfaces
"""

from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass
import numpy as np


@dataclass
class SpeakerEmbedding:
    """Record describing one known speaker and its embedding vector.

    Attributes:
        speaker_id: Identifier of the speaker.
        embedding: Embedding vector stored as a NumPy array.
        confidence: Confidence score attached to this embedding.
        source_files: Files associated with this speaker (presumably the
            audio files the embedding was derived from -- confirm with the
            implementation that populates it).
        sample_count: Number of samples recorded for this speaker.
        created_at: Creation timestamp, kept as a string (format is not
            fixed by this module).
        updated_at: Last-update timestamp, kept as a string (format is not
            fixed by this module).
    """
    speaker_id: str
    embedding: np.ndarray
    confidence: float
    source_files: List[str]
    sample_count: int
    created_at: str
    updated_at: str

    def __eq__(self, other: object) -> bool:
        """Compare two records field by field.

        The dataclass-generated ``__eq__`` compares the fields as tuples,
        which truth-tests ``embedding == other.embedding``. For two distinct
        multi-element arrays that raises ``ValueError`` instead of returning
        a bool, so the array field is compared with ``np.array_equal``.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        # Compare the cheap non-array fields first, then the array.
        plain_fields_match = (
            self.speaker_id,
            self.confidence,
            self.source_files,
            self.sample_count,
            self.created_at,
            self.updated_at,
        ) == (
            other.speaker_id,
            other.confidence,
            other.source_files,
            other.sample_count,
            other.created_at,
            other.updated_at,
        )
        return plain_fields_match and bool(
            np.array_equal(self.embedding, other.embedding)
        )


@dataclass
class SpeakerSegment:
    """A span of audio attributed to a single speaker.

    Attributes:
        start: Start position of the segment (presumably seconds -- the
            unit is not fixed by this module).
        end: End position of the segment, in the same unit as ``start``.
        speaker_id: Identifier of the speaker assigned to the segment.
        confidence: Confidence score for the speaker assignment.
    """
    # Field order is part of the positional constructor signature.
    start: float
    end: float
    speaker_id: str
    confidence: float


class ISpeakerEmbeddingManager(ABC):
    """Abstract contract for a store of speaker embeddings.

    Every method is an abstract coroutine; concrete subclasses must
    override all of them before the class can be instantiated.
    """

    @abstractmethod
    async def find_matching_speaker(
        self, embedding: np.ndarray, source_file: str
    ) -> Optional[str]:
        """Look up an existing speaker that matches ``embedding``.

        Returns a speaker ID string, or ``None`` (per the ``Optional``
        return annotation; presumably when nothing matches).
        """
        ...

    @abstractmethod
    async def add_or_update_speaker(
        self,
        embedding: np.ndarray,
        source_file: str,
        confidence: float = 1.0,
        original_label: Optional[str] = None,
    ) -> str:
        """Register a new speaker or update an existing one.

        Returns the speaker ID as a string.
        """
        ...

    @abstractmethod
    async def map_local_to_global_speakers(
        self, local_embeddings: Dict[str, np.ndarray], source_file: str
    ) -> Dict[str, str]:
        """Translate per-file speaker labels into global speaker IDs.

        ``local_embeddings`` is keyed by local label; the result maps
        local labels to global speaker IDs.
        """
        ...

    @abstractmethod
    async def get_speaker_info(
        self, speaker_id: str
    ) -> Optional[SpeakerEmbedding]:
        """Fetch the stored record for ``speaker_id``, or ``None``."""
        ...

    @abstractmethod
    async def get_all_speakers_summary(self) -> Dict[str, Any]:
        """Return a summary dictionary covering all speakers."""
        ...

    @abstractmethod
    async def save_speakers(self) -> None:
        """Persist speaker data to storage."""
        ...

    @abstractmethod
    async def load_speakers(self) -> None:
        """Read speaker data back from storage."""
        ...


class ISpeakerIdentificationService(ABC):
    """Abstract contract for speaker identification operations.

    Every method is an abstract coroutine; concrete subclasses must
    override all of them before the class can be instantiated.
    """

    @abstractmethod
    async def extract_speaker_embeddings(
        self, audio_path: str, segments: List[SpeakerSegment]
    ) -> Dict[str, np.ndarray]:
        """Compute speaker embeddings from the given audio segments.

        Returns a dictionary of string keys to embedding arrays
        (presumably keyed by speaker label -- confirm with implementers).
        """
        ...

    @abstractmethod
    async def identify_speakers_in_audio(
        self, audio_path: str, transcription_segments: List[Dict[str, Any]]
    ) -> List[SpeakerSegment]:
        """Determine which speakers appear in an audio file.

        Returns the speaker segments found for ``audio_path``.
        """
        ...

    @abstractmethod
    async def map_transcription_to_speakers(
        self,
        transcription_segments: List[Dict[str, Any]],
        speaker_segments: List[SpeakerSegment],
    ) -> List[Dict[str, Any]]:
        """Attach speaker information to transcription segments.

        Returns a list of dictionaries; their schema is not fixed by this
        interface.
        """
        ...