ciyidogan committed on
Commit
709d8e0
·
verified ·
1 Parent(s): 6e51075

Update stt/stt_deepgram.py

Browse files
Files changed (1) hide show
  1. stt/stt_deepgram.py +96 -121
stt/stt_deepgram.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Deepgram Speech-to-Text Implementation
3
  """
4
  import os
5
  import asyncio
@@ -19,7 +19,7 @@ from .stt_interface import STTInterface, STTConfig, TranscriptionResult
19
 
20
 
21
  class DeepgramSTT(STTInterface):
22
- """Deepgram Speech-to-Text implementation with advanced VAD support"""
23
 
24
  def __init__(self, api_key: str):
25
  if not api_key:
@@ -37,53 +37,45 @@ class DeepgramSTT(STTInterface):
37
  self.total_audio_bytes = 0
38
  self.total_chunks = 0
39
 
40
- # VAD tracking
41
- self.vad_enabled = False
42
- self.last_speech_end_time = None
43
 
44
- log_info(f"✅ Deepgram STT initialized")
45
 
46
  def _get_websocket_url(self, config: STTConfig) -> str:
47
- """Build Deepgram WebSocket URL with parameters"""
48
  base_url = "wss://api.deepgram.com/v1/listen"
49
 
 
50
  params = {
51
- "language": config.language,
52
- "model": "nova-2", # Use Nova-2 for best performance
53
- "punctuate": str(config.enable_punctuation).lower(),
54
- "interim_results": str(config.interim_results).lower(),
55
- "utterance_end_ms": str(config.speech_timeout_ms),
56
- "vad_events": str(config.vad_enabled).lower(),
57
- "smart_format": "true",
58
- "no_delay": "true", # Low latency mode
59
- "encoding": self._map_encoding(config.encoding),
60
- "sample_rate": str(config.sample_rate)
 
 
 
 
 
 
 
 
 
 
 
61
  }
62
 
63
- # Add endpointing for VAD support
64
- if config.vad_enabled:
65
- params["endpointing"] = str(config.speech_timeout_ms)
66
-
67
- # Single utterance mode
68
- if config.single_utterance:
69
- params["utterance_end_ms"] = "1000" # Faster end detection for single utterance
70
-
71
  query_string = urlencode(params)
72
  return f"{base_url}?{query_string}"
73
-
74
- def _map_encoding(self, encoding: str) -> str:
75
- """Map encoding to Deepgram format"""
76
- encoding_map = {
77
- "WEBM_OPUS": "webm-opus",
78
- "LINEAR16": "linear16",
79
- "FLAC": "flac",
80
- "MP3": "mp3",
81
- "OGG_OPUS": "ogg-opus",
82
- }
83
- return encoding_map.get(encoding, "webm-opus")
84
 
85
  async def start_streaming(self, config: STTConfig) -> None:
86
- """Initialize streaming session with WebSocket"""
87
  try:
88
  # Stop any existing stream
89
  if self.is_streaming or self.ws_thread:
@@ -94,45 +86,50 @@ class DeepgramSTT(STTInterface):
94
  # Reset session data
95
  self._reset_session_data()
96
 
97
- log_info(f"🎤 Starting Deepgram STT streaming session #{self.session_id}")
98
- log_debug(f"Config: language={config.language}, vad={config.vad_enabled}, interim={config.interim_results}")
99
 
100
  # Clear stop event
101
  self.stop_event.clear()
 
102
 
103
  # Store config
104
  self.config = config
105
- self.vad_enabled = config.vad_enabled
106
 
107
  # Start WebSocket thread
108
  self.is_streaming = True
109
  self.ws_thread = threading.Thread(
110
  target=self._run_websocket,
111
  args=(config,),
112
- name=f"DeepgramSTT-Session-{self.session_id}"
113
  )
114
  self.ws_thread.daemon = True
115
  self.ws_thread.start()
116
 
117
- # Wait a bit for connection
118
  await asyncio.sleep(0.5)
119
 
120
  if not self.is_streaming:
121
  raise RuntimeError("Failed to establish WebSocket connection")
122
 
123
- log_info(f"✅ Deepgram STT streaming session #{self.session_id} started successfully")
124
 
125
  except Exception as e:
126
- log_error(f"❌ Failed to start Deepgram STT streaming", error=str(e))
127
  self.is_streaming = False
128
  self.websocket = None
129
  raise
130
 
131
  async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
132
- """Stream audio chunk and get transcription results"""
133
  if not self.is_streaming:
134
  raise RuntimeError("Streaming not started. Call start_streaming() first.")
135
 
 
 
 
 
 
136
  try:
137
  # Send audio to WebSocket
138
  if self.websocket and not self.websocket.closed:
@@ -146,15 +143,17 @@ class DeepgramSTT(STTInterface):
146
  self.total_chunks += 1
147
  self.total_bytes += len(audio_chunk)
148
 
149
- # Log progress
150
  if self.total_chunks % 50 == 0:
151
- log_debug(f"📊 Progress: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB total")
152
 
153
- # Check for results
154
  while True:
155
  try:
156
  result = self.responses_queue.get_nowait()
157
- yield result
 
 
158
  except queue.Empty:
159
  break
160
 
@@ -165,43 +164,46 @@ class DeepgramSTT(STTInterface):
165
 
166
  def _send_audio_sync(self, audio_chunk: bytes):
167
  """Synchronous method to send audio"""
168
- if self.websocket and not self.websocket.closed:
169
  try:
170
  asyncio.run(self.websocket.send(audio_chunk))
171
  except Exception as e:
172
  log_error(f"❌ Error sending audio chunk: {e}")
173
 
174
  async def stop_streaming(self) -> Optional[TranscriptionResult]:
175
- """Stop streaming and clean up"""
176
  if not self.is_streaming and not self.ws_thread:
177
  log_debug("Already stopped, nothing to do")
178
  return None
179
 
180
  try:
181
- log_info(f"🛑 Stopping Deepgram STT streaming session #{self.session_id}")
182
 
183
  # Set stop flag
184
  self.is_streaming = False
185
  self.stop_event.set()
186
 
187
- # Close WebSocket
188
  if self.websocket and not self.websocket.closed:
189
  try:
 
 
 
190
  await self.websocket.close()
191
  except:
192
  pass
193
 
194
  # Wait for thread
195
  if self.ws_thread and self.ws_thread.is_alive():
196
- log_info("⏳ Waiting for WebSocket thread to finish...")
197
- self.ws_thread.join(timeout=5.0)
198
 
199
  if self.ws_thread.is_alive():
200
  log_warning("⚠️ WebSocket thread did not stop gracefully")
201
  else:
202
- log_info("✅ WebSocket thread finished")
203
 
204
- # Get final result
205
  final_result = None
206
  while not self.responses_queue.empty():
207
  try:
@@ -211,12 +213,13 @@ class DeepgramSTT(STTInterface):
211
  except queue.Empty:
212
  break
213
 
214
- # Reset
215
  self.websocket = None
216
  self.ws_thread = None
217
  self.stop_event.clear()
 
218
 
219
- log_info(f"✅ Deepgram STT streaming session #{self.session_id} stopped")
220
  return final_result
221
 
222
  except Exception as e:
@@ -247,27 +250,24 @@ class DeepgramSTT(STTInterface):
247
  }
248
 
249
  try:
250
- log_info(f"🔌 Connecting to Deepgram WebSocket...")
251
 
252
- async with websockets.connect(url, extra_headers=headers) as websocket:
253
  self.websocket = websocket
254
- log_info(f"✅ Connected to Deepgram WebSocket")
255
 
256
- # Send keep-alive and receive messages
257
  receive_task = asyncio.create_task(self._receive_messages())
258
- keepalive_task = asyncio.create_task(self._send_keepalive())
259
 
260
- # Wait until stop event or connection closes
261
- while not self.stop_event.is_set() and not websocket.closed:
262
  await asyncio.sleep(0.1)
263
 
264
- # Cancel tasks
265
  receive_task.cancel()
266
- keepalive_task.cancel()
267
 
268
  try:
269
  await receive_task
270
- await keepalive_task
271
  except asyncio.CancelledError:
272
  pass
273
 
@@ -279,7 +279,7 @@ class DeepgramSTT(STTInterface):
279
  """Receive and process messages from WebSocket"""
280
  try:
281
  async for message in self.websocket:
282
- if self.stop_event.is_set():
283
  break
284
 
285
  try:
@@ -293,36 +293,28 @@ class DeepgramSTT(STTInterface):
293
  except Exception as e:
294
  log_error(f"❌ Error receiving messages: {e}")
295
 
296
- async def _send_keepalive(self):
297
- """Send keepalive messages to maintain connection"""
298
- try:
299
- while not self.stop_event.is_set():
300
- if self.websocket and not self.websocket.closed:
301
- await self.websocket.send(json.dumps({"type": "KeepAlive"}))
302
- await asyncio.sleep(8) # Deepgram requires keepalive every 10s
303
- except Exception as e:
304
- log_debug(f"Keepalive stopped: {e}")
305
-
306
  def _process_deepgram_message(self, data: Dict[str, Any]):
307
  """Process Deepgram response message"""
308
  msg_type = data.get("type", "")
309
 
310
  if msg_type == "Results":
311
  # Transcription result
312
- channel = data.get("channel", {})
313
- alternatives = channel.get("alternatives", [])
314
 
315
- if alternatives:
316
- alt = alternatives[0]
317
- transcript = alt.get("transcript", "")
318
- confidence = alt.get("confidence", 0.0)
319
- is_final = data.get("is_final", False)
320
 
321
- # Skip empty transcripts unless it's a final result
322
- if transcript.strip() or is_final:
 
 
 
 
323
  result = TranscriptionResult(
324
  text=transcript,
325
- is_final=is_final,
326
  confidence=confidence,
327
  timestamp=datetime.now().timestamp()
328
  )
@@ -330,28 +322,24 @@ class DeepgramSTT(STTInterface):
330
  # Queue result
331
  try:
332
  self.responses_queue.put(result)
 
 
 
 
333
 
334
- if is_final:
335
- log_info(f"🎯 FINAL: '{transcript}'")
336
- else:
337
- log_debug(f"📝 Interim: '{transcript}'")
338
-
339
  except queue.Full:
340
  log_warning("⚠️ Response queue full")
341
 
342
  elif msg_type == "SpeechStarted":
343
  # VAD: Speech started
344
- log_debug("🎤 VAD: Speech started")
345
 
346
  elif msg_type == "UtteranceEnd":
347
- # VAD: Utterance ended
348
- log_debug("🔚 VAD: Utterance ended")
349
- self.last_speech_end_time = datetime.now()
 
350
 
351
- # For single utterance mode, this signals end
352
- if hasattr(self, 'config') and self.config.single_utterance:
353
- log_info("✅ Single utterance completed - VAD triggered")
354
-
355
  elif msg_type == "Error":
356
  # Error message
357
  error = data.get("error", {})
@@ -359,7 +347,8 @@ class DeepgramSTT(STTInterface):
359
 
360
  elif msg_type == "Metadata":
361
  # Connection metadata
362
- log_debug(f"Metadata: {data}")
 
363
 
364
  def _reset_session_data(self):
365
  """Reset session-specific data"""
@@ -374,9 +363,9 @@ class DeepgramSTT(STTInterface):
374
  self.total_audio_bytes = 0
375
  self.total_chunks = 0
376
  self.session_id += 1
377
- self.last_speech_end_time = None
378
 
379
- log_info(f"🔄 Deepgram STT session data reset. New session ID: {self.session_id}")
380
 
381
  def supports_realtime(self) -> bool:
382
  """Deepgram supports real-time streaming"""
@@ -384,7 +373,7 @@ class DeepgramSTT(STTInterface):
384
 
385
  def get_supported_languages(self) -> List[str]:
386
  """Get list of supported language codes"""
387
- # Deepgram supports 36+ languages with Nova-2
388
  return [
389
  "tr", # Turkish
390
  "en", # English
@@ -404,20 +393,6 @@ class DeepgramSTT(STTInterface):
404
  "sv", # Swedish
405
  "pl", # Polish
406
  "hi", # Hindi
407
- "cs", # Czech
408
- "da", # Danish
409
- "fi", # Finnish
410
- "el", # Greek
411
- "he", # Hebrew
412
- "hu", # Hungarian
413
- "id", # Indonesian
414
- "ms", # Malay
415
- "no", # Norwegian
416
- "ro", # Romanian
417
- "sk", # Slovak
418
- "th", # Thai
419
- "uk", # Ukrainian
420
- "vi", # Vietnamese
421
  ]
422
 
423
  def get_provider_name(self) -> str:
 
1
  """
2
+ Deepgram Speech-to-Text Implementation - Optimized for Voice Agent
3
  """
4
  import os
5
  import asyncio
 
19
 
20
 
21
  class DeepgramSTT(STTInterface):
22
+ """Deepgram STT - Single utterance mode with VAD"""
23
 
24
  def __init__(self, api_key: str):
25
  if not api_key:
 
37
  self.total_audio_bytes = 0
38
  self.total_chunks = 0
39
 
40
+ # Final result tracking
41
+ self.final_result_received = False
 
42
 
43
+ log_info(f"✅ Deepgram STT initialized for single utterance mode")
44
 
45
  def _get_websocket_url(self, config: STTConfig) -> str:
46
+ """Build Deepgram WebSocket URL with optimized parameters"""
47
  base_url = "wss://api.deepgram.com/v1/listen"
48
 
49
+ # Manually optimized parameters
50
  params = {
51
+ "language": config.language, # Dil config'den alınır
52
+ "model": "nova-2", # En iyi model
53
+ "punctuate": "true", # Noktalama işaretleri açık
54
+ "interim_results": "false", # ❌ Interim results KAPALI
55
+ "utterance_end_ms": "1000", # 1 saniye sessizlik = konuşma sonu
56
+ "vad_events": "true", # VAD events AÇIK
57
+ "smart_format": "true", # Akıllı formatlama
58
+ "no_delay": "true", # Düşük gecikme modu
59
+ "encoding": "webm-opus", # WebM Opus encoding
60
+ "sample_rate": "16000", # 16kHz sample rate
61
+ "endpointing": "1000", # 1 saniye endpointing
62
+ "diarize": "false", # Speaker diarization kapalı
63
+ "multichannel": "false", # Tek kanal
64
+ "alternatives": "1", # Sadece en iyi alternatif
65
+ "profanity_filter": "false", # Küfür filtresi kapalı
66
+ "redact": "false", # Redaction kapalı
67
+ "replace": "false", # Replace kapalı
68
+ "search": "false", # Search kapalı
69
+ "keywords": "false", # Keywords kapalı
70
+ "filler_words": "false", # Filler words algılama kapalı
71
+ "numerals": "true" # Sayıları rakam olarak yaz
72
  }
73
 
 
 
 
 
 
 
 
 
74
  query_string = urlencode(params)
75
  return f"{base_url}?{query_string}"
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  async def start_streaming(self, config: STTConfig) -> None:
78
+ """Initialize streaming session - single utterance mode"""
79
  try:
80
  # Stop any existing stream
81
  if self.is_streaming or self.ws_thread:
 
86
  # Reset session data
87
  self._reset_session_data()
88
 
89
+ log_info(f"🎤 Starting Deepgram STT - Single Utterance Mode #{self.session_id}")
90
+ log_debug(f"Language: {config.language}, Sample Rate: 16kHz, Utterance End: 1000ms")
91
 
92
  # Clear stop event
93
  self.stop_event.clear()
94
+ self.final_result_received = False
95
 
96
  # Store config
97
  self.config = config
 
98
 
99
  # Start WebSocket thread
100
  self.is_streaming = True
101
  self.ws_thread = threading.Thread(
102
  target=self._run_websocket,
103
  args=(config,),
104
+ name=f"DeepgramSTT-SingleUtterance-{self.session_id}"
105
  )
106
  self.ws_thread.daemon = True
107
  self.ws_thread.start()
108
 
109
+ # Wait for connection
110
  await asyncio.sleep(0.5)
111
 
112
  if not self.is_streaming:
113
  raise RuntimeError("Failed to establish WebSocket connection")
114
 
115
+ log_info(f"✅ Deepgram STT ready - Listening for single utterance")
116
 
117
  except Exception as e:
118
+ log_error(f"❌ Failed to start Deepgram STT", error=str(e))
119
  self.is_streaming = False
120
  self.websocket = None
121
  raise
122
 
123
  async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
124
+ """Stream audio chunk - only returns final results"""
125
  if not self.is_streaming:
126
  raise RuntimeError("Streaming not started. Call start_streaming() first.")
127
 
128
+ # If the final result has already been received, do not accept any more audio
129
+ if self.final_result_received:
130
+ log_debug("Final result already received, ignoring audio chunk")
131
+ return
132
+
133
  try:
134
  # Send audio to WebSocket
135
  if self.websocket and not self.websocket.closed:
 
143
  self.total_chunks += 1
144
  self.total_bytes += len(audio_chunk)
145
 
146
+ # Log progress every 50 chunks
147
  if self.total_chunks % 50 == 0:
148
+ log_debug(f"📊 Listening... {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
149
 
150
+ # Check for final results only
151
  while True:
152
  try:
153
  result = self.responses_queue.get_nowait()
154
+ # Yield only final results
155
+ if result.is_final:
156
+ yield result
157
  except queue.Empty:
158
  break
159
 
 
164
 
165
  def _send_audio_sync(self, audio_chunk: bytes):
166
  """Synchronous method to send audio"""
167
+ if self.websocket and not self.websocket.closed and not self.final_result_received:
168
  try:
169
  asyncio.run(self.websocket.send(audio_chunk))
170
  except Exception as e:
171
  log_error(f"❌ Error sending audio chunk: {e}")
172
 
173
  async def stop_streaming(self) -> Optional[TranscriptionResult]:
174
+ """Stop streaming and dispose"""
175
  if not self.is_streaming and not self.ws_thread:
176
  log_debug("Already stopped, nothing to do")
177
  return None
178
 
179
  try:
180
+ log_info(f"🛑 Disposing Deepgram STT session #{self.session_id}")
181
 
182
  # Set stop flag
183
  self.is_streaming = False
184
  self.stop_event.set()
185
 
186
+ # Close WebSocket with close frame
187
  if self.websocket and not self.websocket.closed:
188
  try:
189
+ # Send close frame to trigger final response
190
+ await self.websocket.send(json.dumps({"type": "CloseStream"}))
191
+ await asyncio.sleep(0.2) # Wait for final response
192
  await self.websocket.close()
193
  except:
194
  pass
195
 
196
  # Wait for thread
197
  if self.ws_thread and self.ws_thread.is_alive():
198
+ log_debug("⏳ Waiting for WebSocket thread to finish...")
199
+ self.ws_thread.join(timeout=3.0)
200
 
201
  if self.ws_thread.is_alive():
202
  log_warning("⚠️ WebSocket thread did not stop gracefully")
203
  else:
204
+ log_debug("✅ WebSocket thread finished")
205
 
206
+ # Get the final result
207
  final_result = None
208
  while not self.responses_queue.empty():
209
  try:
 
213
  except queue.Empty:
214
  break
215
 
216
+ # Reset everything
217
  self.websocket = None
218
  self.ws_thread = None
219
  self.stop_event.clear()
220
+ self.final_result_received = False
221
 
222
+ log_info(f"✅ Deepgram STT session #{self.session_id} disposed")
223
  return final_result
224
 
225
  except Exception as e:
 
250
  }
251
 
252
  try:
253
+ log_debug(f"🔌 Connecting to Deepgram WebSocket...")
254
 
255
+ async with websockets.connect(url, extra_headers=headers, ping_interval=5) as websocket:
256
  self.websocket = websocket
257
+ log_info(f"✅ Connected to Deepgram - Ready for speech")
258
 
259
+ # Receive messages task only (no keepalive needed for short sessions)
260
  receive_task = asyncio.create_task(self._receive_messages())
 
261
 
262
+ # Wait until stop event, final result, or connection closes
263
+ while not self.stop_event.is_set() and not websocket.closed and not self.final_result_received:
264
  await asyncio.sleep(0.1)
265
 
266
+ # Cancel task
267
  receive_task.cancel()
 
268
 
269
  try:
270
  await receive_task
 
271
  except asyncio.CancelledError:
272
  pass
273
 
 
279
  """Receive and process messages from WebSocket"""
280
  try:
281
  async for message in self.websocket:
282
+ if self.stop_event.is_set() or self.final_result_received:
283
  break
284
 
285
  try:
 
293
  except Exception as e:
294
  log_error(f"❌ Error receiving messages: {e}")
295
 
 
 
 
 
 
 
 
 
 
 
296
  def _process_deepgram_message(self, data: Dict[str, Any]):
297
  """Process Deepgram response message"""
298
  msg_type = data.get("type", "")
299
 
300
  if msg_type == "Results":
301
  # Transcription result
302
+ is_final = data.get("is_final", False)
 
303
 
304
+ # Process only final results
305
+ if is_final:
306
+ channel = data.get("channel", {})
307
+ alternatives = channel.get("alternatives", [])
 
308
 
309
+ if alternatives:
310
+ alt = alternatives[0]
311
+ transcript = alt.get("transcript", "")
312
+ confidence = alt.get("confidence", 0.0)
313
+
314
+ # Create final result
315
  result = TranscriptionResult(
316
  text=transcript,
317
+ is_final=True,
318
  confidence=confidence,
319
  timestamp=datetime.now().timestamp()
320
  )
 
322
  # Queue result
323
  try:
324
  self.responses_queue.put(result)
325
+ self.final_result_received = True
326
+
327
+ log_info(f"🎯 FINAL RESULT: '{transcript}' (confidence: {confidence:.2f})")
328
+ log_info(f"📊 Session stats: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
329
 
 
 
 
 
 
330
  except queue.Full:
331
  log_warning("⚠️ Response queue full")
332
 
333
  elif msg_type == "SpeechStarted":
334
  # VAD: Speech started
335
+ log_info("🎤 Speech detected - User started speaking")
336
 
337
  elif msg_type == "UtteranceEnd":
338
+ # VAD: Utterance ended - the user finished speaking
339
+ log_info("🔚 Speech ended - User stopped speaking")
340
+
341
+ # At this point Deepgram will send the final result
342
 
 
 
 
 
343
  elif msg_type == "Error":
344
  # Error message
345
  error = data.get("error", {})
 
347
 
348
  elif msg_type == "Metadata":
349
  # Connection metadata
350
+ request_id = data.get("request_id", "")
351
+ log_debug(f"📋 Connected with request_id: {request_id}")
352
 
353
  def _reset_session_data(self):
354
  """Reset session-specific data"""
 
363
  self.total_audio_bytes = 0
364
  self.total_chunks = 0
365
  self.session_id += 1
366
+ self.final_result_received = False
367
 
368
+ log_debug(f"🔄 Session data reset. New session ID: {self.session_id}")
369
 
370
  def supports_realtime(self) -> bool:
371
  """Deepgram supports real-time streaming"""
 
373
 
374
  def get_supported_languages(self) -> List[str]:
375
  """Get list of supported language codes"""
376
+ # Deepgram Nova-2 supported languages
377
  return [
378
  "tr", # Turkish
379
  "en", # English
 
393
  "sv", # Swedish
394
  "pl", # Polish
395
  "hi", # Hindi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  ]
397
 
398
  def get_provider_name(self) -> str: