ciyidogan committed on
Commit
165e2d0
·
verified ·
1 Parent(s): a532986

Update stt/stt_deepgram.py

Browse files
Files changed (1) hide show
  1. stt/stt_deepgram.py +116 -373
stt/stt_deepgram.py CHANGED
@@ -1,57 +1,36 @@
1
  """
2
- Deepgram Speech-to-Text Implementation using Deepgram SDK
3
  """
4
- import asyncio
5
- from typing import AsyncIterator, Optional, List, Any
6
  from datetime import datetime
7
- import queue
8
- import threading
9
- import traceback
10
-
11
- from deepgram import (
12
- DeepgramClient,
13
- DeepgramClientOptions,
14
- LiveTranscriptionEvents,
15
- LiveOptions,
16
- Microphone,
17
- )
18
-
19
  from utils.logger import log_info, log_error, log_debug, log_warning
20
  from .stt_interface import STTInterface, STTConfig, TranscriptionResult
21
 
22
 
23
  class DeepgramSTT(STTInterface):
24
- """Deepgram STT implementation using official SDK"""
25
-
26
  def __init__(self, api_key: str):
27
- if not api_key:
28
- raise ValueError("Deepgram API key is required")
29
-
30
- # Debug iΓ§in API key'in ilk 10 karakterini logla
31
- log_info(f"πŸ”‘ Deepgram API key resolved: {api_key[:10]}... (length: {len(api_key)})")
 
 
 
 
 
 
 
 
 
32
 
33
- self.api_key = api_key
34
- self.deepgram_client = None
35
- self.live_connection = None
36
- self.is_streaming = False
37
- self.responses_queue = queue.Queue(maxsize=100)
38
-
39
- # Session tracking
40
- self.session_id = 0
41
- self.total_audio_bytes = 0
42
- self.total_chunks = 0
43
-
44
- # Final result tracking
45
- self.final_result_received = False
46
- self.stop_event = threading.Event()
47
-
48
- # βœ… Initial buffer for better VAD context
49
- self.initial_buffer = []
50
-
51
- log_info(f"βœ… Deepgram STT initialized (SDK version)")
52
-
53
  def _map_language_code(self, language: str) -> str:
54
  """Map language codes to Deepgram format"""
 
55
  language_map = {
56
  "tr-TR": "tr",
57
  "en-US": "en-US",
@@ -67,345 +46,109 @@ class DeepgramSTT(STTInterface):
67
  "zh-CN": "zh-CN",
68
  "ar-SA": "ar",
69
  }
70
- return language_map.get(language, language)
71
-
72
- async def start_streaming(self, config: STTConfig) -> None:
73
- """Initialize streaming session using SDK"""
74
- try:
75
- # Stop any existing stream
76
- if self.is_streaming:
77
- log_warning("⚠️ Previous stream still active, stopping it first")
78
- await self.stop_streaming()
79
- await asyncio.sleep(0.5)
80
-
81
- # Reset session data
82
- self._reset_session_data()
83
-
84
- log_info(f"🎀 Starting Deepgram STT (SDK) - Session #{self.session_id}")
85
-
86
- # Create Deepgram client with more verbose logging for debugging
87
- config_options = DeepgramClientOptions(
88
- verbose=False,
89
- options={"keepalive": "true"}
90
- )
91
- self.deepgram_client = DeepgramClient(self.api_key, config=config_options)
92
-
93
- # Try minimal configuration first
94
- options = LiveOptions(
95
- language="tr",
96
- model="nova-2-general",
97
- encoding="linear16",
98
- sample_rate=16000,
99
- interim_results=True, # Bu zorunlu, yoksa final result da gelmiyor...
100
- channels=1,
101
- #utterance_end_ms=2000, # 2 second silence = end
102
- punctuate=True,
103
- smart_format=True,
104
- #numerals=True,
105
- #profanity_filter=False,
106
- #redact=False,
107
- no_delay=True,
108
- vad_events=True, # Enable VAD events
109
- #endpointing=1000
110
- utterance_end_ms=2000
111
- )
112
-
113
- log_info(f"πŸ”§ Deepgram options: language=tr, model=nova-2, encoding=linear16, interim_results=True")
114
-
115
- # Create live connection
116
- self.live_connection = self.deepgram_client.listen.live.v("1")
117
-
118
- # Setup event handlers
119
- self._setup_event_handlers()
120
-
121
- try:
122
- # Log before connection attempt
123
- log_info("πŸ”Œ Attempting to connect to Deepgram...")
124
-
125
- result = self.live_connection.start(options)
126
- log_info(f"πŸ”Œ Connection start result: {result}")
127
-
128
- if result:
129
- self.is_streaming = True
130
- log_info(f"βœ… Deepgram SDK connected - Ready for speech")
131
- else:
132
- # Try to get more error details
133
- if hasattr(self.live_connection, 'get_error') or hasattr(self.live_connection, 'error'):
134
- error_detail = getattr(self.live_connection, 'error', 'No error details')
135
- log_error(f"❌ Connection failed with details: {error_detail}")
136
- raise RuntimeError("Failed to start Deepgram connection")
137
-
138
- except Exception as e:
139
- log_error(f"❌ Connection error: {str(e)}")
140
- # Log more details about the exception
141
- if hasattr(e, 'response'):
142
- log_error(f"❌ Response: {e.response}")
143
- if hasattr(e, 'status_code'):
144
- log_error(f"❌ Status code: {e.status_code}")
145
- raise
146
-
147
- except Exception as e:
148
- log_error(f"❌ Failed to start Deepgram STT", error=str(e))
149
- if hasattr(e, '__dict__'):
150
- log_error(f"❌ Error details: {e.__dict__}")
151
- self.is_streaming = False
152
- self.live_connection = None
153
- self.deepgram_client = None
154
- raise
155
-
156
- def _setup_event_handlers(self):
157
- """Setup event handlers for Deepgram events"""
158
-
159
- # Transcript received - use the existing class method
160
- self.live_connection.on(LiveTranscriptionEvents.Transcript, self._on_transcript)
161
-
162
- # Speech started
163
- self.live_connection.on(LiveTranscriptionEvents.SpeechStarted, self._on_speech_started)
164
-
165
- # Utterance end
166
- self.live_connection.on(LiveTranscriptionEvents.UtteranceEnd, self._on_utterance_end)
167
-
168
- # Metadata
169
- self.live_connection.on(LiveTranscriptionEvents.Metadata, self._on_metadata)
170
-
171
- # Error
172
- self.live_connection.on(LiveTranscriptionEvents.Error, self._on_error)
173
-
174
- # Connection closed
175
- self.live_connection.on(LiveTranscriptionEvents.Close, self._on_close)
176
-
177
- def _on_transcript(self, *args, **kwargs):
178
- """Handle transcript event - SDK calls this method directly"""
179
- try:
180
- # SDK passes the result as second argument
181
- result = args[1] if len(args) > 1 else kwargs.get("result")
182
-
183
- if not result:
184
- log_warning("⚠️ No result in transcript event")
185
- return
186
-
187
- # βœ… Debug iΓ§in result objesini detaylΔ± inceleyin
188
- if self.total_chunks < 5: # Δ°lk birkaΓ§ event iΓ§in
189
- log_debug(f"πŸ” Result object type: {type(result)}")
190
- log_debug(f"πŸ” Result dir: {[attr for attr in dir(result) if not attr.startswith('_')]}")
191
-
192
- # Result'un tΓΌm property'lerini logla
193
- try:
194
- if hasattr(result, '__dict__'):
195
- log_debug(f"πŸ” Result dict: {result.__dict__}")
196
- except:
197
- pass
198
 
199
- # Access properties directly from the result object
200
- is_final = result.is_final if hasattr(result, 'is_final') else False
201
-
202
- # Get transcript from channel alternatives
203
- if hasattr(result, 'channel') and result.channel:
204
- alternatives = result.channel.alternatives
205
- if alternatives and len(alternatives) > 0:
206
- transcript = alternatives[0].transcript
207
- confidence = alternatives[0].confidence
208
-
209
- # Log all transcripts for debugging
210
- log_debug(f"πŸ“ Raw transcript: '{transcript}' (is_final: {is_final}, confidence: {confidence})")
211
-
212
- # βœ… Γ–NEMLΔ° DEĞİŞİKLΔ°K: Final result'larΔ± boş olsa bile kabul et
213
- if is_final:
214
- # Final transcript - boş olabilir ama yine de işle
215
- transcription_result = TranscriptionResult(
216
- text=transcript or "", # Boş string olabilir
217
- is_final=is_final,
218
- confidence=confidence,
219
- timestamp=datetime.now().timestamp()
220
- )
221
-
222
- try:
223
- self.responses_queue.put(transcription_result)
224
- self.final_result_received = True
225
-
226
- if transcript and transcript.strip():
227
- log_info(f"🎯 FINAL TRANSCRIPT: '{transcript}' (confidence: {confidence:.2f})")
228
- else:
229
- log_warning(f"⚠️ Empty final transcript received - but queued for state change")
230
-
231
- except queue.Full:
232
- log_warning("⚠️ Response queue full")
233
-
234
- elif transcript and transcript.strip():
235
- # Interim result - sadece dolu olanlarΔ± kabul et
236
- transcription_result = TranscriptionResult(
237
- text=transcript,
238
- is_final=is_final,
239
- confidence=confidence,
240
- timestamp=datetime.now().timestamp()
241
- )
242
-
243
- try:
244
- self.responses_queue.put(transcription_result)
245
- log_info(f"πŸ“ Interim transcript: '{transcript}'")
246
- except queue.Full:
247
- log_warning("⚠️ Response queue full")
248
-
249
- except Exception as e:
250
- log_error(f"❌ Error processing transcript: {e}")
251
- log_error(f"❌ Args: {args}")
252
- log_error(f"❌ Kwargs: {kwargs}")
253
- import traceback
254
- log_error(f"❌ Traceback: {traceback.format_exc()}")
255
-
256
- def _on_speech_started(self, *args, **kwargs):
257
- """Handle speech started event"""
258
- log_info("🎀 Speech detected - User started speaking")
259
-
260
- def _on_utterance_end(self, *args, **kwargs):
261
- """Handle utterance end event"""
262
- log_info("πŸ”š Speech ended - User stopped speaking")
263
- # Deepgram will send final transcript after this
264
-
265
- def _on_metadata(self, *args, **kwargs):
266
- """Handle metadata event"""
267
- metadata = args[1] if len(args) > 1 else kwargs.get("metadata", {})
268
- request_id = metadata.get("request_id", "")
269
- log_debug(f"πŸ“‹ Deepgram metadata - Request ID: {request_id}")
270
-
271
- def _on_error(self, *args, **kwargs):
272
- """Handle error event"""
273
- error = args[1] if len(args) > 1 else kwargs.get("error", {})
274
- log_error(f"❌ Deepgram error: {error}")
275
-
276
- def _on_close(self, *args, **kwargs):
277
- """Handle connection close event"""
278
- log_info("πŸ”Œ Deepgram connection closed")
279
- self.is_streaming = False
280
-
281
- async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
282
- """Stream audio chunk and get transcription results"""
283
- if not self.is_streaming or not self.live_connection:
284
- raise RuntimeError("Streaming not started. Call start_streaming() first.")
285
-
286
  try:
287
- # βœ… Δ°lk birkaΓ§ chunk'Δ± biriktirip gΓΆnder (daha iyi context)
288
- if not hasattr(self, 'initial_buffer'):
289
- self.initial_buffer = []
290
-
291
- # Δ°lk birkaΓ§ chunk iΓ§in audio formatΔ±nΔ± analiz et
292
- if self.total_chunks < 3:
293
- if len(audio_chunk) >= 4:
294
- import struct
295
- try:
296
- first_sample = struct.unpack('<h', audio_chunk[:2])[0]
297
- log_info(f"πŸ”Š Audio format check - Chunk #{self.total_chunks}: First sample={first_sample}, Size={len(audio_chunk)} bytes")
298
- except:
299
- log_warning("⚠️ Could not parse as Linear16")
300
-
301
- self.initial_buffer.append(audio_chunk)
302
-
303
- # 3. chunk'ta hepsini birden gΓΆnder
304
- if self.total_chunks == 2:
305
- combined_audio = b''.join(self.initial_buffer)
306
- self.live_connection.send(combined_audio)
307
- self.initial_buffer = []
308
- log_info(f"🎯 Sent initial audio buffer: {len(combined_audio)} bytes")
309
- else:
310
- # Send audio to Deepgram (final result gelse bile gΓΆnder, Deepgram kendi handle edecek)
311
- self.live_connection.send(audio_chunk)
312
-
313
- self.total_chunks += 1
314
- self.total_audio_bytes += len(audio_chunk)
315
-
316
- # Log progress
317
- if self.total_chunks % 50 == 0:
318
- log_debug(f"πŸ“Š Listening... {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")
319
-
320
- # Check queue for results
321
- while True:
322
- try:
323
- result = self.responses_queue.get_nowait()
324
-
325
- # Log for debugging
326
- log_debug(f"🎯 Yielding result: is_final={result.is_final}, text='{result.text}'")
327
-
328
- yield result
329
 
330
- except queue.Empty:
331
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- except Exception as e:
334
- log_error(f"❌ Error streaming audio", error=str(e))
335
- self.is_streaming = False
336
- raise
337
-
338
- async def stop_streaming(self) -> Optional[TranscriptionResult]:
339
- """Stop streaming and clean up"""
340
- if not self.is_streaming:
341
- log_debug("Already stopped, nothing to do")
342
- return None
343
-
344
- try:
345
- log_info(f"πŸ›‘ Stopping Deepgram STT session #{self.session_id}")
346
-
347
- self.is_streaming = False
348
-
349
- # Finish the stream to get final results
350
- if self.live_connection:
351
- try:
352
- # Finish the stream - this triggers final transcript
353
- self.live_connection.finish()
354
-
355
- # Wait a bit for final result
356
- await asyncio.sleep(0.5)
357
-
358
- except Exception as e:
359
- log_warning(f"⚠️ Error finishing stream: {e}")
360
-
361
- # Get final result from queue
362
- final_result = None
363
- while not self.responses_queue.empty():
364
- try:
365
- result = self.responses_queue.get_nowait()
366
- if result.is_final:
367
- final_result = result
368
- except queue.Empty:
369
- break
370
-
371
- # Clean up
372
- self.live_connection = None
373
- self.deepgram_client = None
374
- self.final_result_received = False
375
-
376
- log_info(f"βœ… Deepgram STT session #{self.session_id} stopped")
377
- return final_result
378
 
379
  except Exception as e:
380
- log_error(f"❌ Error during stop_streaming", error=str(e))
381
- self.is_streaming = False
382
- self.live_connection = None
383
- self.deepgram_client = None
384
  return None
385
-
386
- def _reset_session_data(self):
387
- """Reset session-specific data"""
388
- # Clear queue
389
- while not self.responses_queue.empty():
390
- try:
391
- self.responses_queue.get_nowait()
392
- except:
393
- pass
394
-
395
- # Reset counters
396
- self.total_audio_bytes = 0
397
- self.total_chunks = 0
398
- self.session_id += 1
399
- self.final_result_received = False
400
-
401
- # βœ… Clear initial buffer
402
- self.initial_buffer = []
403
 
404
- log_debug(f"πŸ”„ Session data reset. New session ID: {self.session_id}")
405
-
406
- def supports_realtime(self) -> bool:
407
- """Deepgram supports real-time streaming"""
408
- return True
 
 
 
 
 
409
 
410
  def get_supported_languages(self) -> List[str]:
411
  """Get list of supported language codes"""
 
1
  """
2
+ Deepgram Speech-to-Text Implementation - Simple Batch Mode
3
  """
4
+ from typing import Optional, List
 
5
  from datetime import datetime
6
+ import io
7
+ import wave
8
+ import aiohttp
9
+ import json
 
 
 
 
 
 
 
 
10
  from utils.logger import log_info, log_error, log_debug, log_warning
11
  from .stt_interface import STTInterface, STTConfig, TranscriptionResult
12
 
13
 
14
  class DeepgramSTT(STTInterface):
 
 
15
  def __init__(self, api_key: str):
16
+ """
17
+ Initialize Deepgram STT
18
+ Args:
19
+ api_key: Deepgram API key
20
+ """
21
+ try:
22
+ self.api_key = api_key
23
+ self.base_url = "https://api.deepgram.com/v1/listen"
24
+
25
+ log_info("βœ… Deepgram STT initialized in batch mode")
26
+
27
+ except Exception as e:
28
+ log_error(f"❌ Failed to initialize Deepgram STT: {str(e)}")
29
+ raise
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def _map_language_code(self, language: str) -> str:
32
  """Map language codes to Deepgram format"""
33
+ # Deepgram uses different language codes
34
  language_map = {
35
  "tr-TR": "tr",
36
  "en-US": "en-US",
 
46
  "zh-CN": "zh-CN",
47
  "ar-SA": "ar",
48
  }
49
+ return language_map.get(language, "en-US")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
52
+ """Transcribe audio data using Deepgram API"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  try:
54
+ # Check if we have audio to transcribe
55
+ if not audio_data:
56
+ log_warning("⚠️ No audio data provided")
57
+ return None
58
+
59
+ log_info(f"πŸ“Š Transcribing {len(audio_data)} bytes of audio")
60
+
61
+ # Convert to WAV format for better compatibility
62
+ wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
63
+
64
+ # Build Deepgram API parameters
65
+ language = self._map_language_code(config.language)
66
+
67
+ params = {
68
+ "language": language,
69
+ "punctuate": str(config.enable_punctuation).lower(),
70
+ "model": config.model if config.model != "latest_long" else "general",
71
+ "tier": "enhanced" if config.use_enhanced else "base",
72
+ }
73
+
74
+ # Add word timestamps if requested
75
+ if config.enable_word_timestamps:
76
+ params["utterances"] = "true"
77
+ params["words"] = "true"
78
+
79
+ # Build URL with parameters
80
+ url = f"{self.base_url}?" + "&".join([f"{k}={v}" for k, v in params.items()])
81
+
82
+ # Prepare headers
83
+ headers = {
84
+ "Authorization": f"Token {self.api_key}",
85
+ "Content-Type": "audio/wav"
86
+ }
87
+
88
+ # Make API request
89
+ log_info(f"πŸ”„ Sending audio to Deepgram API...")
90
+ async with aiohttp.ClientSession() as session:
91
+ async with session.post(url, headers=headers, data=wav_audio) as response:
92
+ if response.status == 200:
93
+ result = await response.json()
 
 
94
 
95
+ # Extract transcription from response
96
+ if result.get("results") and result["results"].get("channels"):
97
+ channel = result["results"]["channels"][0]
98
+ if channel.get("alternatives"):
99
+ alternative = channel["alternatives"][0]
100
+
101
+ # Extract word timestamps if available
102
+ word_timestamps = None
103
+ if config.enable_word_timestamps and alternative.get("words"):
104
+ word_timestamps = [
105
+ {
106
+ "word": word["word"],
107
+ "start_time": word["start"],
108
+ "end_time": word["end"]
109
+ }
110
+ for word in alternative["words"]
111
+ ]
112
+
113
+ transcription = TranscriptionResult(
114
+ text=alternative.get("transcript", ""),
115
+ confidence=alternative.get("confidence", 0.0),
116
+ timestamp=datetime.now().timestamp(),
117
+ language=language,
118
+ word_timestamps=word_timestamps
119
+ )
120
+
121
+ log_info(f"βœ… Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
122
+ return transcription
123
 
124
+ log_warning("⚠️ No transcription in response")
125
+ return None
126
+ else:
127
+ error_text = await response.text()
128
+ log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
129
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  except Exception as e:
132
+ log_error(f"❌ Error during transcription: {str(e)}")
133
+ import traceback
134
+ log_error(f"Traceback: {traceback.format_exc()}")
 
135
  return None
136
+
137
+ def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
138
+ """Convert raw PCM audio to WAV format"""
139
+ # Create WAV file in memory
140
+ wav_buffer = io.BytesIO()
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ with wave.open(wav_buffer, 'wb') as wav_file:
143
+ # Set WAV parameters
144
+ wav_file.setnchannels(1) # Mono
145
+ wav_file.setsampwidth(2) # 16-bit
146
+ wav_file.setframerate(sample_rate)
147
+ wav_file.writeframes(audio_data)
148
+
149
+ # Get WAV data
150
+ wav_buffer.seek(0)
151
+ return wav_buffer.read()
152
 
153
  def get_supported_languages(self) -> List[str]:
154
  """Get list of supported language codes"""