# Spaces: UcsTurkey / flare (Building)
# File size: 7,212 Bytes

"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
import aiohttp
import json
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class DeepgramSTT(STTInterface):
    def __init__(self, api_key: str):
        """
        Set up the state needed for batch-mode Deepgram requests.

        Stores the credential and the endpoint URL that ``transcribe``
        posts audio to. Any exception is logged and then re-raised.

        Args:
            api_key: Deepgram API key
        """
        try:
            # Endpoint used for every transcription request
            self.base_url = "https://api.deepgram.com/v1/listen"
            self.api_key = api_key

            log_info("✅ Deepgram STT initialized in batch mode")

        except Exception as init_error:
            log_error(f"❌ Failed to initialize Deepgram STT: {init_error!s}")
            raise
    
    def _map_language_code(self, language: str) -> str:
        """Map language codes to Deepgram format"""
        # Deepgram uses different language codes
        language_map = {
            "tr": "tr",
            "tr-TR": "tr",
            "en": "en-US",
            "en-US": "en-US",
            "en-GB": "en-GB",
            "de": "de",
            "de-DE": "de",
            "fr": "fr",
            "fr-FR": "fr",
            "es": "es",
            "es-ES": "es",
            "it": "it",
            "it-IT": "it",
            "pt": "pt-BR",
            "pt-BR": "pt-BR",
            "ru": "ru",
            "ru-RU": "ru",
            "ja": "ja",
            "ja-JP": "ja",
            "ko": "ko",
            "ko-KR": "ko",
            "zh": "zh-CN",
            "zh-CN": "zh-CN",
            "ar": "ar",
            "ar-SA": "ar",
        }
        
        # Default to the language itself if not in map
        return language_map.get(language, language)
    
    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe one audio buffer with a single Deepgram API request.

        The audio is wrapped into a WAV container by ``_convert_to_wav`` and
        POSTed to ``self.base_url``. Query parameters are handed to aiohttp
        so that they are URL-encoded instead of being concatenated by hand.

        Args:
            audio_data: Audio bytes to transcribe; passed to
                ``_convert_to_wav`` together with ``config.sample_rate``.
            config: Settings read here: ``language``, ``sample_rate``,
                ``enable_punctuation``, ``model``, ``use_enhanced`` and
                ``enable_word_timestamps``.

        Returns:
            A ``TranscriptionResult`` built from the first alternative of the
            first channel, or ``None`` when no audio was given, the API
            answered with a non-200 status, the response holds no
            alternative, or any exception occurred (exceptions are logged,
            never raised to the caller).
        """
        try:
            # Nothing to send
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            language = self._map_language_code(config.language)

            params = {
                "language": language,
                "punctuate": str(config.enable_punctuation).lower(),
                # "latest_long" is replaced by Deepgram's "general" model
                "model": config.model if config.model != "latest_long" else "general",
                "tier": "enhanced" if config.use_enhanced else "base",
            }

            # Add word timestamps if requested
            if config.enable_word_timestamps:
                params["utterances"] = "true"
                params["words"] = "true"

            # str() keeps the textual form the hand-built query used, while
            # aiohttp takes care of percent-encoding reserved characters.
            query = {key: str(value) for key, value in params.items()}

            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav"
            }

            log_info("🔄 Sending audio to Deepgram API...")
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    self.base_url, params=query, headers=headers, data=wav_audio
                ) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None

                    result = await response.json()

            # Only the first alternative of the first channel is used
            channels = (result.get("results") or {}).get("channels") or []
            alternatives = channels[0].get("alternatives") if channels else None
            if not alternatives:
                log_warning("⚠️ No transcription in response")
                return None

            alternative = alternatives[0]

            # Extract word timestamps if available
            word_timestamps = None
            if config.enable_word_timestamps and alternative.get("words"):
                word_timestamps = [
                    {
                        "word": word["word"],
                        "start_time": word["start"],
                        "end_time": word["end"]
                    }
                    for word in alternative["words"]
                ]

            transcription = TranscriptionResult(
                text=alternative.get("transcript", ""),
                confidence=alternative.get("confidence", 0.0),
                timestamp=datetime.now().timestamp(),
                language=language,
                word_timestamps=word_timestamps
            )

            log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
            return transcription

        except Exception as e:
            # Best-effort API: failures are logged and reported as None
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
    
    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()
        
        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        
        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider reports as supported."""
        # Language codes supported by Deepgram
        # Source: https://developers.deepgram.com/docs/models-languages
        codes = (
            "en es fr de it pt ru tr pl nl "
            "sv no fi da ja ko zh hi id th "
            "uk cs el he ar fa ta tl"
        )
        return codes.split()

    def get_provider_name(self) -> str:
        """Return the identifier string of this STT provider."""
        provider_name = "deepgram"
        return provider_name