flare / stt_interface.py
ciyidogan's picture
Update stt_interface.py
e579c02 verified
raw
history blame
2.15 kB
"""
STT (Speech-to-Text) Interface for Flare
"""
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, AsyncIterator, List
from dataclasses import dataclass
from enum import Enum
import json
class STTEngineType(Enum):
    """Identifiers for the STT backends Flare can be configured with."""

    NO_STT = "no_stt"    # speech recognition disabled entirely
    GOOGLE = "google"    # presumably Google Cloud Speech-to-Text — confirm
    AZURE = "azure"      # presumably Azure Speech — confirm
    AMAZON = "amazon"    # presumably Amazon Transcribe — confirm
    FLICKER = "flicker"  # NOTE(review): custom/in-house engine? verify meaning
@dataclass
class STTConfig:
    """Parameters handed to an STT engine when a streaming session starts.

    Defaults target Turkish speech over a WebM/Opus stream; every field can
    be overridden per session.
    """

    language: str = "tr-TR"               # recognition language tag
    sample_rate: int = 16000              # audio sample rate in Hz
    encoding: str = "WEBM_OPUS"           # name of the incoming audio encoding
    enable_punctuation: bool = True       # ask engine to insert punctuation
    enable_word_timestamps: bool = False  # per-word timing info (off by default)
    model: str = "latest_long"            # engine-specific model identifier
    use_enhanced: bool = True             # prefer the enhanced model variant
    single_utterance: bool = False        # keep streaming past the first utterance
    interim_results: bool = True          # emit partial (non-final) transcripts

    # --- Voice Activity Detection ---
    vad_enabled: bool = True
    speech_timeout_ms: int = 2000         # silence window before end-of-speech

    # --- Noise reduction ---
    noise_reduction_enabled: bool = True
    noise_reduction_level: int = 2        # NOTE(review): scale/units engine-specific — confirm
@dataclass
class TranscriptionResult:
    """One transcription event emitted by an STT engine."""

    text: str                                       # transcribed text so far
    is_final: bool                                  # True once the engine commits this text
    confidence: float                               # engine-reported confidence score
    timestamp: float                                # when the result was produced
    word_timestamps: Optional[List[Dict]] = None    # per-word timing, if requested
    language: Optional[str] = None                  # detected/assumed language, if reported
    is_interrupt: bool = False                      # True when speech interrupted playback
class STTInterface(ABC):
"""Abstract base class for STT providers"""
@abstractmethod
async def start_streaming(self, config: STTConfig) -> None:
"""Start streaming session"""
pass
@abstractmethod
async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
"""Stream audio chunk and get transcription results"""
pass
@abstractmethod
async def stop_streaming(self) -> Optional[TranscriptionResult]:
"""Stop streaming and get final result"""
pass
@abstractmethod
def supports_realtime(self) -> bool:
"""Check if provider supports real-time streaming"""
pass
@abstractmethod
def get_supported_languages(self) -> List[str]:
"""Get list of supported language codes"""
pass
@abstractmethod
def get_provider_name(self) -> str:
"""Get provider name for logging"""
pass