ciyidogan committed on
Commit
e90d3a0
·
verified ·
1 Parent(s): c5bf788

Create stt_deepgram.py

Browse files
Files changed (1) hide show
  1. stt/stt_deepgram.py +425 -0
stt/stt_deepgram.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deepgram Speech-to-Text Implementation
3
+ """
4
+ import os
5
+ import asyncio
6
+ import websockets
7
+ import json
8
+ from typing import AsyncIterator, Optional, List, Any, Dict
9
+ from datetime import datetime
10
+ import queue
11
+ import threading
12
+ import time
13
+ import traceback
14
+ import base64
15
+ from urllib.parse import urlencode
16
+
17
+ from utils.logger import log_info, log_error, log_debug, log_warning
18
+ from .stt_interface import STTInterface, STTConfig, TranscriptionResult
19
+
20
+
21
class DeepgramSTT(STTInterface):
    """Deepgram Speech-to-Text implementation with advanced VAD support.

    Threading model: ``start_streaming`` spawns a daemon thread that owns a
    private asyncio event loop and the Deepgram WebSocket. Audio is handed to
    that loop with ``asyncio.run_coroutine_threadsafe``; transcription results
    travel back to the caller through the thread-safe ``responses_queue``.
    The socket is only ever touched from the loop that created it.
    """

    # Seconds start_streaming() waits for the WebSocket handshake.
    _CONNECT_TIMEOUT_S = 5.0
    # Seconds a single audio send may take before it is abandoned.
    _SEND_TIMEOUT_S = 5.0
    # Seconds stop_streaming() waits for the worker thread to exit.
    _THREAD_JOIN_TIMEOUT_S = 5.0
    # Seconds between KeepAlive messages.
    _KEEPALIVE_INTERVAL_S = 8

    def __init__(self, api_key: str):
        """Create an idle client.

        Args:
            api_key: Deepgram API key, sent as ``Authorization: Token <key>``.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("Deepgram API key is required")

        self.api_key = api_key
        self.config: Optional[STTConfig] = None
        self.websocket = None
        self.is_streaming = False
        self.responses_queue = queue.Queue(maxsize=100)
        self.ws_thread = None
        self.stop_event = threading.Event()

        # Event loop owned by the worker thread (None while no thread runs).
        self._ws_loop = None
        # Set by the worker thread once the WebSocket handshake succeeded.
        self._connected_event = threading.Event()

        # Session tracking
        self.session_id = 0
        self.total_audio_bytes = 0
        self.total_chunks = 0

        # VAD tracking
        self.vad_enabled = False
        self.last_speech_end_time = None

        log_info("✅ Deepgram STT initialized")

    def _get_websocket_url(self, config: STTConfig) -> str:
        """Build the Deepgram ``/v1/listen`` WebSocket URL for ``config``.

        Booleans are serialised as lowercase ``true``/``false`` and numbers as
        strings, then URL-encoded into the query string.
        """
        base_url = "wss://api.deepgram.com/v1/listen"

        params = {
            "language": config.language,
            "model": "nova-2",  # Use Nova-2 for best performance
            "punctuate": str(config.enable_punctuation).lower(),
            "interim_results": str(config.interim_results).lower(),
            "utterance_end_ms": str(config.speech_timeout_ms),
            "vad_events": str(config.vad_enabled).lower(),
            "smart_format": "true",
            "no_delay": "true",  # Low latency mode
            "encoding": self._map_encoding(config.encoding),
            "sample_rate": str(config.sample_rate),
        }

        # Add endpointing for VAD support
        if config.vad_enabled:
            params["endpointing"] = str(config.speech_timeout_ms)

        # Single utterance mode overrides the configured timeout with a
        # fixed 1000 ms for faster end detection.
        if config.single_utterance:
            params["utterance_end_ms"] = "1000"

        return f"{base_url}?{urlencode(params)}"

    def _map_encoding(self, encoding: str) -> str:
        """Map an ``STTConfig`` encoding name to the value sent to Deepgram.

        Unknown names fall back to ``"webm-opus"``.
        """
        # NOTE(review): verify these values against Deepgram's documented
        # `encoding` options; container formats may need the parameter
        # omitted rather than named - TODO confirm.
        encoding_map = {
            "WEBM_OPUS": "webm-opus",
            "LINEAR16": "linear16",
            "FLAC": "flac",
            "MP3": "mp3",
            "OGG_OPUS": "ogg-opus",
        }
        return encoding_map.get(encoding, "webm-opus")

    async def start_streaming(self, config: STTConfig) -> None:
        """Start a new streaming session and wait until the socket is open.

        Any session that is still active is stopped first. The method returns
        once the worker thread reports a successful WebSocket handshake.

        Raises:
            RuntimeError: If the connection fails or is not established
                within ``_CONNECT_TIMEOUT_S`` seconds.
        """
        try:
            # Stop any existing stream
            if self.is_streaming or self.ws_thread:
                log_warning("⚠️ Previous stream still active, stopping it first")
                await self.stop_streaming()
                await asyncio.sleep(0.5)

            # Reset session data
            self._reset_session_data()

            log_info(f"🎤 Starting Deepgram STT streaming session #{self.session_id}")
            log_debug(f"Config: language={config.language}, vad={config.vad_enabled}, interim={config.interim_results}")

            # Clear per-session signalling flags
            self.stop_event.clear()
            self._connected_event.clear()

            # Store config
            self.config = config
            self.vad_enabled = config.vad_enabled

            # Start WebSocket thread
            self.is_streaming = True
            self.ws_thread = threading.Thread(
                target=self._run_websocket,
                args=(config,),
                name=f"DeepgramSTT-Session-{self.session_id}",
                daemon=True,
            )
            self.ws_thread.start()

            # Wait for the handshake instead of sleeping a fixed interval: a
            # fixed sleep reports success while the socket is still
            # connecting, and audio sent in that window is silently dropped.
            deadline = time.monotonic() + self._CONNECT_TIMEOUT_S
            while not self._connected_event.is_set():
                # The worker clears is_streaming as soon as connecting fails.
                if not self.is_streaming or time.monotonic() >= deadline:
                    raise RuntimeError("Failed to establish WebSocket connection")
                await asyncio.sleep(0.05)

            log_info(f"✅ Deepgram STT streaming session #{self.session_id} started successfully")

        except Exception as e:
            log_error("❌ Failed to start Deepgram STT streaming", error=str(e))
            self.is_streaming = False
            self.websocket = None
            # Tell a worker that is still connecting to shut down once it can.
            self.stop_event.set()
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Send one audio chunk and yield every result currently queued.

        The chunk is only sent (and counted) while the socket is open.

        Raises:
            RuntimeError: If streaming has not been started.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")

        try:
            # Send audio to WebSocket
            if self.websocket and not self.websocket.closed:
                # The blocking hand-off runs in the default executor so the
                # caller's event loop is not stalled while the send completes.
                await asyncio.get_running_loop().run_in_executor(
                    None,
                    self._send_audio_sync,
                    audio_chunk,
                )

                self.total_chunks += 1
                self.total_audio_bytes += len(audio_chunk)

                # Log progress
                if self.total_chunks % 50 == 0:
                    log_debug(f"📊 Progress: {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB total")

            # Check for results
            while True:
                try:
                    result = self.responses_queue.get_nowait()
                except queue.Empty:
                    break
                yield result

        except Exception as e:
            log_error("❌ Deepgram STT streaming error", error=str(e))
            self.is_streaming = False
            raise

    def _send_audio_sync(self, audio_chunk: bytes):
        """Send ``audio_chunk`` from a non-loop thread and wait for completion.

        The send coroutine is scheduled on the worker thread's event loop,
        because the socket belongs to that loop. Failures are logged, not
        raised.
        """
        websocket = self.websocket
        loop = self._ws_loop
        if websocket is None or websocket.closed:
            return
        if loop is None or loop.is_closed():
            return

        send_coro = websocket.send(audio_chunk)
        try:
            future = asyncio.run_coroutine_threadsafe(send_coro, loop)
        except RuntimeError as e:
            # Loop was closed between the check and the call; close the
            # coroutine so it does not emit a "never awaited" warning.
            send_coro.close()
            log_error(f"❌ Error sending audio chunk: {e}")
            return

        try:
            future.result(timeout=self._SEND_TIMEOUT_S)
        except Exception as e:
            future.cancel()
            log_error(f"❌ Error sending audio chunk: {e}")

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop the session and return the last queued final result, if any.

        Setting ``stop_event`` makes the worker thread leave its
        ``async with websockets.connect(...)`` block, which closes the socket
        on the loop that owns it.

        Returns:
            The most recent queued result with ``is_final`` set, or ``None``
            when nothing was running, no final result was queued, or cleanup
            failed.
        """
        if not self.is_streaming and not self.ws_thread:
            log_debug("Already stopped, nothing to do")
            return None

        try:
            log_info(f"🛑 Stopping Deepgram STT streaming session #{self.session_id}")

            # Set stop flag
            self.is_streaming = False
            self.stop_event.set()

            # Wait for thread without blocking the caller's event loop
            worker = self.ws_thread
            worker_exited = True
            if worker and worker.is_alive():
                log_info("⏳ Waiting for WebSocket thread to finish...")
                await asyncio.get_running_loop().run_in_executor(
                    None, worker.join, self._THREAD_JOIN_TIMEOUT_S
                )

                if worker.is_alive():
                    worker_exited = False
                    log_warning("⚠️ WebSocket thread did not stop gracefully")
                else:
                    log_info("✅ WebSocket thread finished")

            # Get final result: the last queued result flagged as final
            final_result = None
            for result in self._drain_responses():
                if result.is_final:
                    final_result = result

            # Reset
            self.websocket = None
            self.ws_thread = None
            if worker_exited:
                # Keep the flag set for a worker that is still running so it
                # can notice the stop request; start_streaming() clears it.
                self.stop_event.clear()

            log_info(f"✅ Deepgram STT streaming session #{self.session_id} stopped")
            return final_result

        except Exception as e:
            log_error("❌ Error during stop_streaming", error=str(e))
            self.is_streaming = False
            self.websocket = None
            self.ws_thread = None
            return None

    def _run_websocket(self, config: STTConfig):
        """Thread target: run the WebSocket handler on a private event loop."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # Published so other threads can schedule sends onto this loop.
        self._ws_loop = loop

        try:
            loop.run_until_complete(self._websocket_handler(config))
        except Exception as e:
            log_error("❌ WebSocket thread error", error=str(e), traceback=traceback.format_exc())
        finally:
            self._ws_loop = None
            loop.close()
            self.is_streaming = False

    async def _websocket_handler(self, config: STTConfig):
        """Connect to Deepgram and service the socket until stopped or closed."""
        url = self._get_websocket_url(config)
        headers = {
            "Authorization": f"Token {self.api_key}"
        }

        try:
            log_info("🔌 Connecting to Deepgram WebSocket...")

            # NOTE(review): `extra_headers` and `.closed` look like the legacy
            # `websockets` client API - confirm the pinned library version.
            async with websockets.connect(url, extra_headers=headers) as websocket:
                self.websocket = websocket
                self._connected_event.set()
                log_info("✅ Connected to Deepgram WebSocket")

                # Send keep-alive and receive messages
                receive_task = asyncio.create_task(self._receive_messages())
                keepalive_task = asyncio.create_task(self._send_keepalive())

                try:
                    # Wait until stop event or connection closes
                    while not self.stop_event.is_set() and not websocket.closed:
                        await asyncio.sleep(0.1)
                finally:
                    # Cancel both tasks and await both; gather collects each
                    # CancelledError so neither task is left un-awaited.
                    receive_task.cancel()
                    keepalive_task.cancel()
                    await asyncio.gather(
                        receive_task, keepalive_task, return_exceptions=True
                    )

        except Exception as e:
            log_error("❌ WebSocket connection error", error=str(e))
            self.is_streaming = False

    async def _receive_messages(self):
        """Read messages from the socket, parse them as JSON and dispatch."""
        try:
            async for message in self.websocket:
                if self.stop_event.is_set():
                    break

                try:
                    data = json.loads(message)
                    self._process_deepgram_message(data)
                except json.JSONDecodeError as e:
                    log_error(f"❌ Failed to parse message: {e}")

        except websockets.exceptions.ConnectionClosed:
            log_info("WebSocket connection closed")
        except Exception as e:
            log_error(f"❌ Error receiving messages: {e}")

    async def _send_keepalive(self):
        """Send a ``KeepAlive`` message every ``_KEEPALIVE_INTERVAL_S`` seconds."""
        keepalive_message = json.dumps({"type": "KeepAlive"})
        try:
            while not self.stop_event.is_set():
                if self.websocket and not self.websocket.closed:
                    await self.websocket.send(keepalive_message)
                # Original author's note: Deepgram requires keepalive every 10s
                await asyncio.sleep(self._KEEPALIVE_INTERVAL_S)
        except Exception as e:
            log_debug(f"Keepalive stopped: {e}")

    def _process_deepgram_message(self, data: Dict[str, Any]):
        """Handle one decoded Deepgram message according to its ``type``."""
        msg_type = data.get("type", "")

        if msg_type == "Results":
            # Transcription result
            channel = data.get("channel", {})
            alternatives = channel.get("alternatives", [])

            if alternatives:
                alt = alternatives[0]
                transcript = alt.get("transcript", "")
                confidence = alt.get("confidence", 0.0)
                is_final = data.get("is_final", False)

                # Skip empty transcripts unless it's a final result
                if transcript.strip() or is_final:
                    result = TranscriptionResult(
                        text=transcript,
                        is_final=is_final,
                        confidence=confidence,
                        timestamp=datetime.now().timestamp(),
                    )

                    # Queue result without blocking: this runs on the worker
                    # loop, and a blocking put() on a full queue would stall
                    # receiving and keep-alives.
                    try:
                        self.responses_queue.put_nowait(result)
                    except queue.Full:
                        log_warning("⚠️ Response queue full")
                    else:
                        if is_final:
                            log_info(f"🎯 FINAL: '{transcript}'")
                        else:
                            log_debug(f"📝 Interim: '{transcript}'")

        elif msg_type == "SpeechStarted":
            # VAD: Speech started
            log_debug("🎤 VAD: Speech started")

        elif msg_type == "UtteranceEnd":
            # VAD: Utterance ended
            log_debug("🔚 VAD: Utterance ended")
            self.last_speech_end_time = datetime.now()

            # For single utterance mode, this signals end
            if self.config is not None and self.config.single_utterance:
                log_info("✅ Single utterance completed - VAD triggered")

        elif msg_type == "Error":
            # Error message
            error = data.get("error", {})
            log_error(f"❌ Deepgram error: {error}")

        elif msg_type == "Metadata":
            # Connection metadata
            log_debug(f"Metadata: {data}")

    def _drain_responses(self) -> List[TranscriptionResult]:
        """Remove and return everything currently in ``responses_queue``."""
        drained = []
        while True:
            try:
                drained.append(self.responses_queue.get_nowait())
            except queue.Empty:
                return drained

    def _reset_session_data(self):
        """Discard queued results, zero the counters and bump the session id."""
        # Clear queue
        self._drain_responses()

        # Reset counters
        self.total_audio_bytes = 0
        self.total_chunks = 0
        self.session_id += 1
        self.last_speech_end_time = None

        log_info(f"🔄 Deepgram STT session data reset. New session ID: {self.session_id}")

    def supports_realtime(self) -> bool:
        """Deepgram supports real-time streaming"""
        return True

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Deepgram supports 36+ languages with Nova-2
        return [
            "tr",     # Turkish
            "en",     # English
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de",     # German
            "fr",     # French
            "es",     # Spanish
            "it",     # Italian
            "pt",     # Portuguese
            "ru",     # Russian
            "ja",     # Japanese
            "ko",     # Korean
            "zh",     # Chinese
            "ar",     # Arabic
            "nl",     # Dutch
            "sv",     # Swedish
            "pl",     # Polish
            "hi",     # Hindi
            "cs",     # Czech
            "da",     # Danish
            "fi",     # Finnish
            "el",     # Greek
            "he",     # Hebrew
            "hu",     # Hungarian
            "id",     # Indonesian
            "ms",     # Malay
            "no",     # Norwegian
            "ro",     # Romanian
            "sk",     # Slovak
            "th",     # Thai
            "uk",     # Ukrainian
            "vi",     # Vietnamese
        ]

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "deepgram"