""" Deepgram Speech-to-Text Implementation - Simple Batch Mode """ from typing import Optional, List from datetime import datetime import io import wave import aiohttp import json from utils.logger import log_info, log_error, log_debug, log_warning from .stt_interface import STTInterface, STTConfig, TranscriptionResult class DeepgramSTT(STTInterface): def __init__(self, api_key: str): """ Initialize Deepgram STT Args: api_key: Deepgram API key """ try: self.api_key = api_key self.base_url = "https://api.deepgram.com/v1/listen" log_info("✅ Deepgram STT initialized in batch mode") except Exception as e: log_error(f"❌ Failed to initialize Deepgram STT: {str(e)}") raise def _map_language_code(self, language: str) -> str: """Map language codes to Deepgram format""" # Deepgram uses different language codes language_map = { "tr": "tr", "tr-TR": "tr", "en": "en-US", "en-US": "en-US", "en-GB": "en-GB", "de": "de", "de-DE": "de", "fr": "fr", "fr-FR": "fr", "es": "es", "es-ES": "es", "it": "it", "it-IT": "it", "pt": "pt-BR", "pt-BR": "pt-BR", "ru": "ru", "ru-RU": "ru", "ja": "ja", "ja-JP": "ja", "ko": "ko", "ko-KR": "ko", "zh": "zh-CN", "zh-CN": "zh-CN", "ar": "ar", "ar-SA": "ar", } # Default to the language itself if not in map return language_map.get(language, language) async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]: """Transcribe audio data using Deepgram API""" try: # Check if we have audio to transcribe if not audio_data: log_warning("⚠️ No audio data provided") return None log_info(f"📊 Transcribing {len(audio_data)} bytes of audio") # Convert to WAV format for better compatibility wav_audio = self._convert_to_wav(audio_data, config.sample_rate) # Build Deepgram API parameters language = self._map_language_code(config.language) params = { "language": language, "punctuate": str(config.enable_punctuation).lower(), "model": config.model if config.model != "latest_long" else "general", "tier": "enhanced" if config.use_enhanced else "base", } # Add word timestamps if requested if config.enable_word_timestamps: params["utterances"] = "true" params["words"] = "true" # Build URL with parameters url = f"{self.base_url}?" + "&".join([f"{k}={v}" for k, v in params.items()]) # Prepare headers headers = { "Authorization": f"Token {self.api_key}", "Content-Type": "audio/wav" } # Make API request log_info(f"🔄 Sending audio to Deepgram API...") async with aiohttp.ClientSession() as session: async with session.post(url, headers=headers, data=wav_audio) as response: if response.status == 200: result = await response.json() # Extract transcription from response if result.get("results") and result["results"].get("channels"): channel = result["results"]["channels"][0] if channel.get("alternatives"): alternative = channel["alternatives"][0] # Extract word timestamps if available word_timestamps = None if config.enable_word_timestamps and alternative.get("words"): word_timestamps = [ { "word": word["word"], "start_time": word["start"], "end_time": word["end"] } for word in alternative["words"] ] transcription = TranscriptionResult( text=alternative.get("transcript", ""), confidence=alternative.get("confidence", 0.0), timestamp=datetime.now().timestamp(), language=language, word_timestamps=word_timestamps ) log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})") return transcription log_warning("⚠️ No transcription in response") return None else: error_text = await response.text() log_error(f"❌ Deepgram API error: {response.status} - {error_text}") return None except Exception as e: log_error(f"❌ Error during transcription: {str(e)}") import traceback log_error(f"Traceback: {traceback.format_exc()}") return None def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes: """Convert raw PCM audio to WAV format""" # Create WAV file in memory wav_buffer = io.BytesIO() with wave.open(wav_buffer, 'wb') as wav_file: # Set WAV parameters wav_file.setnchannels(1) # Mono wav_file.setsampwidth(2) # 16-bit wav_file.setframerate(sample_rate) wav_file.writeframes(audio_data) # Get WAV data wav_buffer.seek(0) return wav_buffer.read() def get_supported_languages(self) -> List[str]: """Get list of supported language codes""" # Deepgram'ın desteklediği dil kodları # Kaynak: https://developers.deepgram.com/docs/models-languages return [ "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl", "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th", "uk", "cs", "el", "he", "ar", "fa", "ta", "tl" ] def get_provider_name(self) -> str: """Get provider name""" return "deepgram"