"""
STT (Speech-to-Text) Interface for Flare
"""
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Dict, Any, AsyncIterator, List
class STTEngineType(Enum):
    """Identifiers of the STT engines this module knows about.

    Each member's value is the lowercase string key for that engine, so a
    member can be looked up from such a string with ``STTEngineType("google")``.
    """

    # NOTE(review): presumably selects "no speech-to-text at all" -- confirm
    # against the code that consumes this enum.
    NO_STT = "no_stt"
    GOOGLE = "google"
    AZURE = "azure"
    AMAZON = "amazon"
    FLICKER = "flicker"
@dataclass
class STTConfig:
    """STT configuration parameters.

    A plain value object: every field has a default, so ``STTConfig()`` gives
    the stock configuration and individual settings can be overridden by
    keyword (``STTConfig(language="en-US")``). How each setting is interpreted
    is decided by the concrete STT provider, not by this class.
    """

    language: str = "tr-TR"
    sample_rate: int = 16000
    encoding: str = "WEBM_OPUS"
    enable_punctuation: bool = True
    enable_word_timestamps: bool = False
    model: str = "latest_long"
    use_enhanced: bool = True
    single_utterance: bool = False
    interim_results: bool = True

    # Voice Activity Detection
    vad_enabled: bool = True
    # NOTE(review): unit is milliseconds per the name; the exact meaning
    # (silence length that ends an utterance?) is not defined here -- confirm.
    speech_timeout_ms: int = 2000

    # Noise reduction
    noise_reduction_enabled: bool = True
    # NOTE(review): the valid range/scale of this level is not defined here.
    noise_reduction_level: int = 2
@dataclass
class TranscriptionResult:
    """Result from STT engine.

    ``text``, ``is_final``, ``confidence`` and ``timestamp`` must be supplied
    by the caller; the remaining fields are optional and default as shown.
    """

    text: str
    is_final: bool
    confidence: float
    # NOTE(review): the time base/unit of this value is not defined here --
    # confirm with the providers that populate it.
    timestamp: float
    word_timestamps: Optional[List[Dict]] = None
    language: Optional[str] = None
    is_interrupt: bool = False
class STTInterface(ABC):
    """Abstract base class for STT providers.

    Every method below is abstract: a concrete provider must override all of
    them before it can be instantiated. The bodies here do nothing.
    """

    @abstractmethod
    async def start_streaming(self, config: STTConfig) -> None:
        """Start streaming session.

        Args:
            config: STT settings to use for the session.
        """

    @abstractmethod
    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Stream audio chunk and get transcription results.

        Args:
            audio_chunk: Raw audio bytes for this chunk.

        Returns:
            Per the annotation, an async iterator of transcription results;
            how implementations produce it is not fixed by this base class.
        """

    @abstractmethod
    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and get final result.

        Returns:
            The final transcription result, or ``None`` if there is none.
        """

    @abstractmethod
    def supports_realtime(self) -> bool:
        """Check if provider supports real-time streaming."""

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""

    @abstractmethod
    def get_provider_name(self) -> str:
        """Get provider name for logging."""