File size: 7,212 Bytes
5d50ed0
165e2d0
5d50ed0
165e2d0
5d50ed0
165e2d0
 
 
 
5d50ed0
 
 
 
 
 
165e2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
5d50ed0
 
 
165e2d0
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
 
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
911913b
5d50ed0
 
911913b
 
 
c3db99d
165e2d0
 
5d50ed0
165e2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6472323
165e2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6472323
165e2d0
 
 
 
 
 
5d50ed0
 
165e2d0
 
 
5d50ed0
165e2d0
 
 
 
 
5d50ed0
165e2d0
 
 
 
 
 
 
 
 
 
5d50ed0
 
 
fa1c68b
 
5d50ed0
fa1c68b
 
 
5d50ed0
 
 
 
e90d3a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
import aiohttp
import json
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class DeepgramSTT(STTInterface):
    def __init__(self, api_key: str):
        """
        Set up the Deepgram client for batch (non-streaming) transcription.

        Stores the API key and the REST endpoint on the instance and logs
        that the provider is ready.

        Args:
            api_key: Deepgram API key, stored as ``self.api_key``.

        Raises:
            Exception: Any error raised during setup is logged and re-raised
                unchanged.
        """
        try:
            # Endpoint for Deepgram's "listen" REST API
            self.base_url = "https://api.deepgram.com/v1/listen"
            self.api_key = api_key

            log_info("✅ Deepgram STT initialized in batch mode")
        except Exception as init_error:
            log_error(f"❌ Failed to initialize Deepgram STT: {str(init_error)}")
            raise
    
    def _map_language_code(self, language: str) -> str:
        """Map language codes to Deepgram format"""
        # Deepgram uses different language codes
        language_map = {
            "tr": "tr",
            "tr-TR": "tr",
            "en": "en-US",
            "en-US": "en-US",
            "en-GB": "en-GB",
            "de": "de",
            "de-DE": "de",
            "fr": "fr",
            "fr-FR": "fr",
            "es": "es",
            "es-ES": "es",
            "it": "it",
            "it-IT": "it",
            "pt": "pt-BR",
            "pt-BR": "pt-BR",
            "ru": "ru",
            "ru-RU": "ru",
            "ja": "ja",
            "ja-JP": "ja",
            "ko": "ko",
            "ko-KR": "ko",
            "zh": "zh-CN",
            "zh-CN": "zh-CN",
            "ar": "ar",
            "ar-SA": "ar",
        }
        
        # Default to the language itself if not in map
        return language_map.get(language, language)
    
    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """
        Transcribe a complete audio buffer with the Deepgram REST API.

        The audio is wrapped in a WAV container, POSTed to ``self.base_url``
        and the first alternative of the first channel in the response is
        turned into a ``TranscriptionResult``.

        Args:
            audio_data: Raw audio bytes; handed to ``_convert_to_wav``, which
                writes them as mono 16-bit frames.
            config: Transcription settings. The fields read here are
                ``sample_rate``, ``language``, ``enable_punctuation``,
                ``model``, ``use_enhanced`` and ``enable_word_timestamps``.

        Returns:
            A ``TranscriptionResult`` on success. ``None`` when no audio was
            given, the API answers with a non-200 status, the response holds
            no alternative, or any exception occurs (errors are logged, not
            raised).
        """
        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # Build Deepgram API parameters
            language = self._map_language_code(config.language)

            params = {
                "language": language,
                "punctuate": str(config.enable_punctuation).lower(),
                "model": config.model if config.model != "latest_long" else "general",
                "tier": "enhanced" if config.use_enhanced else "base",
            }

            # Add word timestamps if requested
            if config.enable_word_timestamps:
                params["utterances"] = "true"
                params["words"] = "true"

            # Percent-encode the query string so that a value containing
            # '&', '=', '#' or whitespace cannot corrupt the request URL.
            from urllib.parse import urlencode
            url = f"{self.base_url}?{urlencode(params)}"

            # Prepare headers
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav"
            }

            # Make API request
            log_info("🔄 Sending audio to Deepgram API...")
            async with aiohttp.ClientSession() as session:
                async with session.post(url, headers=headers, data=wav_audio) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None

                    result = await response.json()

            # Only the first alternative of the first channel is used
            channels = (result.get("results") or {}).get("channels")
            alternatives = channels[0].get("alternatives") if channels else None
            if not alternatives:
                log_warning("⚠️ No transcription in response")
                return None

            alternative = alternatives[0]

            # Extract word timestamps if available
            word_timestamps = None
            if config.enable_word_timestamps and alternative.get("words"):
                word_timestamps = [
                    {
                        "word": word["word"],
                        "start_time": word["start"],
                        "end_time": word["end"]
                    }
                    for word in alternative["words"]
                ]

            transcription = TranscriptionResult(
                text=alternative.get("transcript", ""),
                confidence=alternative.get("confidence", 0.0),
                timestamp=datetime.now().timestamp(),
                language=language,
                word_timestamps=word_timestamps
            )

            log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
            return transcription

        except Exception as e:
            # Best-effort API: every failure is logged and reported as None
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
    
    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()
        
        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        
        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Deepgram'ın desteklediği dil kodları
        # Kaynak: https://developers.deepgram.com/docs/models-languages
        return [
            "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl",
            "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th",
            "uk", "cs", "el", "he", "ar", "fa", "ta", "tl"
        ]

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "deepgram"