Spaces:

UcsTurkey
/

flare

Building

App Files Files Community

ciyidogan committed 11 days ago

Commit

709d8e0

verified ·

1 Parent(s): 6e51075

Update stt/stt_deepgram.py

Browse files

Files changed (1) hide show

stt/stt_deepgram.py +96 -121

stt/stt_deepgram.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Deepgram Speech-to-Text Implementation
 """
 import os
 import asyncio
@@ -19,7 +19,7 @@ from .stt_interface import STTInterface, STTConfig, TranscriptionResult
 class DeepgramSTT(STTInterface):
-    """Deepgram Speech-to-Text implementation with advanced VAD support"""
     def __init__(self, api_key: str):
         if not api_key:
@@ -37,53 +37,45 @@ class DeepgramSTT(STTInterface):
         self.total_audio_bytes = 0
         self.total_chunks = 0
-        # VAD tracking
-        self.vad_enabled = False
-        self.last_speech_end_time = None
-        log_info(f"✅ Deepgram STT initialized")
     def _get_websocket_url(self, config: STTConfig) -> str:
-        """Build Deepgram WebSocket URL with parameters"""
         base_url = "wss://api.deepgram.com/v1/listen"
         params = {
-            "language": config.language,
-            "model": "nova-2",  # Use Nova-2 for best performance
-            "punctuate": str(config.enable_punctuation).lower(),
-            "interim_results": str(config.interim_results).lower(),
-            "utterance_end_ms": str(config.speech_timeout_ms),
-            "vad_events": str(config.vad_enabled).lower(),
-            "smart_format": "true",
-            "no_delay": "true",  # Low latency mode
-            "encoding": self._map_encoding(config.encoding),
-            "sample_rate": str(config.sample_rate)
         }
-        # Add endpointing for VAD support
-        if config.vad_enabled:
-            params["endpointing"] = str(config.speech_timeout_ms)
-        # Single utterance mode
-        if config.single_utterance:
-            params["utterance_end_ms"] = "1000"  # Faster end detection for single utterance
         query_string = urlencode(params)
         return f"{base_url}?{query_string}"
-    def _map_encoding(self, encoding: str) -> str:
-        """Map encoding to Deepgram format"""
-        encoding_map = {
-            "WEBM_OPUS": "webm-opus",
-            "LINEAR16": "linear16",
-            "FLAC": "flac",
-            "MP3": "mp3",
-            "OGG_OPUS": "ogg-opus",
-        }
-        return encoding_map.get(encoding, "webm-opus")
     async def start_streaming(self, config: STTConfig) -> None:
-        """Initialize streaming session with WebSocket"""
         try:
             # Stop any existing stream
             if self.is_streaming or self.ws_thread:
@@ -94,45 +86,50 @@ class DeepgramSTT(STTInterface):
             # Reset session data
             self._reset_session_data()
-            log_info(f"🎤 Starting Deepgram STT streaming session #{self.session_id}")
-            log_debug(f"Config: language={config.language}, vad={config.vad_enabled}, interim={config.interim_results}")
             # Clear stop event
             self.stop_event.clear()
             # Store config
             self.config = config
-            self.vad_enabled = config.vad_enabled
             # Start WebSocket thread
             self.is_streaming = True
             self.ws_thread = threading.Thread(
                 target=self._run_websocket,
                 args=(config,),
-                name=f"DeepgramSTT-Session-{self.session_id}"
             )
             self.ws_thread.daemon = True
             self.ws_thread.start()
-            # Wait a bit for connection
             await asyncio.sleep(0.5)
             if not self.is_streaming:
                 raise RuntimeError("Failed to establish WebSocket connection")
-            log_info(f"✅ Deepgram STT streaming session #{self.session_id} started successfully")
         except Exception as e:
-            log_error(f"❌ Failed to start Deepgram STT streaming", error=str(e))
             self.is_streaming = False
             self.websocket = None
             raise
     async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
-        """Stream audio chunk and get transcription results"""
         if not self.is_streaming:
             raise RuntimeError("Streaming not started. Call start_streaming() first.")
         try:
             # Send audio to WebSocket
             if self.websocket and not self.websocket.closed:
@@ -146,15 +143,17 @@ class DeepgramSTT(STTInterface):
                 self.total_chunks += 1
                 self.total_bytes += len(audio_chunk)
-                # Log progress
                 if self.total_chunks % 50 == 0:
-                    log_debug(f"📊 Progress: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB total")
-            # Check for results
             while True:
                 try:
                     result = self.responses_queue.get_nowait()
-                    yield result
                 except queue.Empty:
                     break
@@ -165,43 +164,46 @@ class DeepgramSTT(STTInterface):
     def _send_audio_sync(self, audio_chunk: bytes):
         """Synchronous method to send audio"""
-        if self.websocket and not self.websocket.closed:
             try:
                 asyncio.run(self.websocket.send(audio_chunk))
             except Exception as e:
                 log_error(f"❌ Error sending audio chunk: {e}")
     async def stop_streaming(self) -> Optional[TranscriptionResult]:
-        """Stop streaming and clean up"""
         if not self.is_streaming and not self.ws_thread:
             log_debug("Already stopped, nothing to do")
             return None
         try:
-            log_info(f"🛑 Stopping Deepgram STT streaming session #{self.session_id}")
             # Set stop flag
             self.is_streaming = False
             self.stop_event.set()
-            # Close WebSocket
             if self.websocket and not self.websocket.closed:
                 try:
                     await self.websocket.close()
                 except:
                     pass
             # Wait for thread
             if self.ws_thread and self.ws_thread.is_alive():
-                log_info("⏳ Waiting for WebSocket thread to finish...")
-                self.ws_thread.join(timeout=5.0)
                 if self.ws_thread.is_alive():
                     log_warning("⚠️ WebSocket thread did not stop gracefully")
                 else:
-                    log_info("✅ WebSocket thread finished")
-            # Get final result
             final_result = None
             while not self.responses_queue.empty():
                 try:
@@ -211,12 +213,13 @@ class DeepgramSTT(STTInterface):
                 except queue.Empty:
                     break
-            # Reset
             self.websocket = None
             self.ws_thread = None
             self.stop_event.clear()
-            log_info(f"✅ Deepgram STT streaming session #{self.session_id} stopped")
             return final_result
         except Exception as e:
@@ -247,27 +250,24 @@ class DeepgramSTT(STTInterface):
         }
         try:
-            log_info(f"🔌 Connecting to Deepgram WebSocket...")
-            async with websockets.connect(url, extra_headers=headers) as websocket:
                 self.websocket = websocket
-                log_info(f"✅ Connected to Deepgram WebSocket")
-                # Send keep-alive and receive messages
                 receive_task = asyncio.create_task(self._receive_messages())
-                keepalive_task = asyncio.create_task(self._send_keepalive())
-                # Wait until stop event or connection closes
-                while not self.stop_event.is_set() and not websocket.closed:
                     await asyncio.sleep(0.1)
-                # Cancel tasks
                 receive_task.cancel()
-                keepalive_task.cancel()
                 try:
                     await receive_task
-                    await keepalive_task
                 except asyncio.CancelledError:
                     pass
@@ -279,7 +279,7 @@ class DeepgramSTT(STTInterface):
         """Receive and process messages from WebSocket"""
         try:
             async for message in self.websocket:
-                if self.stop_event.is_set():
                     break
                 try:
@@ -293,36 +293,28 @@ class DeepgramSTT(STTInterface):
         except Exception as e:
             log_error(f"❌ Error receiving messages: {e}")
-    async def _send_keepalive(self):
-        """Send keepalive messages to maintain connection"""
-        try:
-            while not self.stop_event.is_set():
-                if self.websocket and not self.websocket.closed:
-                    await self.websocket.send(json.dumps({"type": "KeepAlive"}))
-                await asyncio.sleep(8)  # Deepgram requires keepalive every 10s
-        except Exception as e:
-            log_debug(f"Keepalive stopped: {e}")
     def _process_deepgram_message(self, data: Dict[str, Any]):
         """Process Deepgram response message"""
         msg_type = data.get("type", "")
         if msg_type == "Results":
             # Transcription result
-            channel = data.get("channel", {})
-            alternatives = channel.get("alternatives", [])
-            if alternatives:
-                alt = alternatives[0]
-                transcript = alt.get("transcript", "")
-                confidence = alt.get("confidence", 0.0)
-                is_final = data.get("is_final", False)
-                # Skip empty transcripts unless it's a final result
-                if transcript.strip() or is_final:
                     result = TranscriptionResult(
                         text=transcript,
-                        is_final=is_final,
                         confidence=confidence,
                         timestamp=datetime.now().timestamp()
                     )
@@ -330,28 +322,24 @@ class DeepgramSTT(STTInterface):
                     # Queue result
                     try:
                         self.responses_queue.put(result)
-                        if is_final:
-                            log_info(f"🎯 FINAL: '{transcript}'")
-                        else:
-                            log_debug(f"📝 Interim: '{transcript}'")
                     except queue.Full:
                         log_warning("⚠️ Response queue full")
         elif msg_type == "SpeechStarted":
             # VAD: Speech started
-            log_debug("🎤 VAD: Speech started")
         elif msg_type == "UtteranceEnd":
-            # VAD: Utterance ended
-            log_debug("🔚 VAD: Utterance ended")
-            self.last_speech_end_time = datetime.now()
-            # For single utterance mode, this signals end
-            if hasattr(self, 'config') and self.config.single_utterance:
-                log_info("✅ Single utterance completed - VAD triggered")
         elif msg_type == "Error":
             # Error message
             error = data.get("error", {})
@@ -359,7 +347,8 @@ class DeepgramSTT(STTInterface):
         elif msg_type == "Metadata":
             # Connection metadata
-            log_debug(f"Metadata: {data}")
     def _reset_session_data(self):
         """Reset session-specific data"""
@@ -374,9 +363,9 @@ class DeepgramSTT(STTInterface):
         self.total_audio_bytes = 0
         self.total_chunks = 0
         self.session_id += 1
-        self.last_speech_end_time = None
-        log_info(f"🔄 Deepgram STT session data reset. New session ID: {self.session_id}")
     def supports_realtime(self) -> bool:
         """Deepgram supports real-time streaming"""
@@ -384,7 +373,7 @@ class DeepgramSTT(STTInterface):
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
-        # Deepgram supports 36+ languages with Nova-2
         return [
             "tr",      # Turkish
             "en",      # English
@@ -404,20 +393,6 @@ class DeepgramSTT(STTInterface):
             "sv",      # Swedish
             "pl",      # Polish
             "hi",      # Hindi
-            "cs",      # Czech
-            "da",      # Danish
-            "fi",      # Finnish
-            "el",      # Greek
-            "he",      # Hebrew
-            "hu",      # Hungarian
-            "id",      # Indonesian
-            "ms",      # Malay
-            "no",      # Norwegian
-            "ro",      # Romanian
-            "sk",      # Slovak
-            "th",      # Thai
-            "uk",      # Ukrainian
-            "vi",      # Vietnamese
         ]
     def get_provider_name(self) -> str:

 """
+Deepgram Speech-to-Text Implementation - Optimized for Voice Agent
 """
 import os
 import asyncio
 class DeepgramSTT(STTInterface):
+    """Deepgram STT - Single utterance mode with VAD"""
     def __init__(self, api_key: str):
         if not api_key:
         self.total_audio_bytes = 0
         self.total_chunks = 0
+        # Final result tracking
+        self.final_result_received = False
+        log_info(f"✅ Deepgram STT initialized for single utterance mode")
     def _get_websocket_url(self, config: STTConfig) -> str:
+        """Build Deepgram WebSocket URL with optimized parameters"""
         base_url = "wss://api.deepgram.com/v1/listen"
+        # Hand-tuned parameters: only the language is read from config, every other value below is a fixed literal
         params = {
+            "language": config.language,  # Language is taken from config
+            "model": "nova-2",           # Best model
+            "punctuate": "true",         # Punctuation on
+            "interim_results": "false",   # ❌ Interim results OFF
+            "utterance_end_ms": "1000",   # 1 second of silence = end of speech - NOTE(review): confirm this works with interim results off
+            "vad_events": "true",         # VAD events ON
+            "smart_format": "true",       # Smart formatting
+            "no_delay": "true",          # Low-latency mode
+            "encoding": "webm-opus",      # WebM Opus encoding - NOTE(review): confirm this is an accepted encoding value
+            "sample_rate": "16000",       # 16kHz sample rate
+            "endpointing": "1000",        # 1 second endpointing
+            "diarize": "false",          # Speaker diarization off
+            "multichannel": "false",      # Single channel
+            "alternatives": "1",          # Best alternative only
+            "profanity_filter": "false",  # Profanity filter off
+            "redact": "false",           # Redaction off
+            "replace": "false",          # Replace off - NOTE(review): presumably term-valued, so this may be read as the term false; confirm
+            "search": "false",           # Search off - NOTE(review): presumably term-valued, so this may search for the term false; confirm
+            "keywords": "false",         # Keywords off - NOTE(review): presumably term-valued, so this may boost the word false; confirm
+            "filler_words": "false",     # Filler-word detection off
+            "numerals": "true"           # Write numbers as digits
         }
         query_string = urlencode(params)
         return f"{base_url}?{query_string}"
     async def start_streaming(self, config: STTConfig) -> None:
+        """Initialize streaming session - single utterance mode"""
         try:
             # Stop any existing stream
             if self.is_streaming or self.ws_thread:
             # Reset session data
             self._reset_session_data()
+            log_info(f"🎤 Starting Deepgram STT - Single Utterance Mode #{self.session_id}")
+            log_debug(f"Language: {config.language}, Sample Rate: 16kHz, Utterance End: 1000ms")
             # Clear stop event
             self.stop_event.clear()
+            self.final_result_received = False
             # Store config
             self.config = config
             # Start WebSocket thread
             self.is_streaming = True
             self.ws_thread = threading.Thread(
                 target=self._run_websocket,
                 args=(config,),
+                name=f"DeepgramSTT-SingleUtterance-{self.session_id}"
             )
             self.ws_thread.daemon = True
             self.ws_thread.start()
+            # Wait for connection
             await asyncio.sleep(0.5)
             if not self.is_streaming:
                 raise RuntimeError("Failed to establish WebSocket connection")
+            log_info(f"✅ Deepgram STT ready - Listening for single utterance")
         except Exception as e:
+            log_error(f"❌ Failed to start Deepgram STT", error=str(e))
             self.is_streaming = False
             self.websocket = None
             raise
     async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
+        """Stream audio chunk - only returns final results"""
         if not self.is_streaming:
             raise RuntimeError("Streaming not started. Call start_streaming() first.")
+        # Eğer final result alındıysa, daha fazla audio kabul etme
+        if self.final_result_received:
+            log_debug("Final result already received, ignoring audio chunk")
+            return
         try:
             # Send audio to WebSocket
             if self.websocket and not self.websocket.closed:
                 self.total_chunks += 1
                 self.total_bytes += len(audio_chunk)
+                # Log progress every 50 chunks
                 if self.total_chunks % 50 == 0:
+                    log_debug(f"📊 Listening... {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
+            # Check for final results only
             while True:
                 try:
                     result = self.responses_queue.get_nowait()
+                    # Sadece final result'ları yield et
+                    if result.is_final:
+                        yield result
                 except queue.Empty:
                     break
     def _send_audio_sync(self, audio_chunk: bytes):
         """Synchronous method to send audio"""
+        if self.websocket and not self.websocket.closed and not self.final_result_received:  # silently skip when there is no open socket or a final result has already arrived
             try:
                 asyncio.run(self.websocket.send(audio_chunk))  # asyncio.run drives the send on a fresh event loop per call - NOTE(review): confirm the socket may be used from a loop other than the one that opened it
             except Exception as e:
                 log_error(f"❌ Error sending audio chunk: {e}")  # best effort: the failure is logged and not re-raised
     async def stop_streaming(self) -> Optional[TranscriptionResult]:
+        """Stop streaming and dispose"""
         if not self.is_streaming and not self.ws_thread:
             log_debug("Already stopped, nothing to do")
             return None
         try:
+            log_info(f"🛑 Disposing Deepgram STT session #{self.session_id}")
             # Set stop flag
             self.is_streaming = False
             self.stop_event.set()
+            # Close WebSocket with close frame
             if self.websocket and not self.websocket.closed:
                 try:
+                    # Send close frame to trigger final response
+                    await self.websocket.send(json.dumps({"type": "CloseStream"}))
+                    await asyncio.sleep(0.2)  # Wait for final response
                     await self.websocket.close()
                 except:
                     pass
             # Wait for thread
             if self.ws_thread and self.ws_thread.is_alive():
+                log_debug("⏳ Waiting for WebSocket thread to finish...")
+                self.ws_thread.join(timeout=3.0)
                 if self.ws_thread.is_alive():
                     log_warning("⚠️ WebSocket thread did not stop gracefully")
                 else:
+                    log_debug("✅ WebSocket thread finished")
+            # Get the final result
             final_result = None
             while not self.responses_queue.empty():
                 try:
                 except queue.Empty:
                     break
+            # Reset everything
             self.websocket = None
             self.ws_thread = None
             self.stop_event.clear()
+            self.final_result_received = False
+            log_info(f"✅ Deepgram STT session #{self.session_id} disposed")
             return final_result
         except Exception as e:
         }
         try:
+            log_debug(f"🔌 Connecting to Deepgram WebSocket...")
+            async with websockets.connect(url, extra_headers=headers, ping_interval=5) as websocket:
                 self.websocket = websocket
+                log_info(f"✅ Connected to Deepgram - Ready for speech")
+                # Receive messages task only (no keepalive needed for short sessions)
                 receive_task = asyncio.create_task(self._receive_messages())
+                # Wait until stop event, final result, or connection closes
+                while not self.stop_event.is_set() and not websocket.closed and not self.final_result_received:
                     await asyncio.sleep(0.1)
+                # Cancel task
                 receive_task.cancel()
                 try:
                     await receive_task
                 except asyncio.CancelledError:
                     pass
         """Receive and process messages from WebSocket"""
         try:
             async for message in self.websocket:
+                if self.stop_event.is_set() or self.final_result_received:
                     break
                 try:
         except Exception as e:
             log_error(f"❌ Error receiving messages: {e}")
     def _process_deepgram_message(self, data: Dict[str, Any]):
         """Process Deepgram response message"""
         msg_type = data.get("type", "")
         if msg_type == "Results":
             # Transcription result
+            is_final = data.get("is_final", False)
+            # Sadece final result'ları işle
+            if is_final:
+                channel = data.get("channel", {})
+                alternatives = channel.get("alternatives", [])
+                if alternatives:
+                    alt = alternatives[0]
+                    transcript = alt.get("transcript", "")
+                    confidence = alt.get("confidence", 0.0)
+                    # Create final result
                     result = TranscriptionResult(
                         text=transcript,
+                        is_final=True,
                         confidence=confidence,
                         timestamp=datetime.now().timestamp()
                     )
                     # Queue result
                     try:
                         self.responses_queue.put(result)
+                        self.final_result_received = True
+                        log_info(f"🎯 FINAL RESULT: '{transcript}' (confidence: {confidence:.2f})")
+                        log_info(f"📊 Session stats: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
                     except queue.Full:
                         log_warning("⚠️ Response queue full")
         elif msg_type == "SpeechStarted":
             # VAD: Speech started
+            log_info("🎤 Speech detected - User started speaking")
         elif msg_type == "UtteranceEnd":
+            # VAD: Utterance ended - kullanıcı konuşmayı bitirdi
+            log_info("🔚 Speech ended - User stopped speaking")
+            # Bu noktada Deepgram final result gönderecek
         elif msg_type == "Error":
             # Error message
             error = data.get("error", {})
         elif msg_type == "Metadata":
             # Connection metadata
+            request_id = data.get("request_id", "")
+            log_debug(f"📋 Connected with request_id: {request_id}")
     def _reset_session_data(self):
         """Reset session-specific data"""
         self.total_audio_bytes = 0
         self.total_chunks = 0
         self.session_id += 1
+        self.final_result_received = False
+        log_debug(f"🔄 Session data reset. New session ID: {self.session_id}")
     def supports_realtime(self) -> bool:
         """Deepgram supports real-time streaming"""
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
+        # Deepgram Nova-2 supported languages
         return [
             "tr",      # Turkish
             "en",      # English
             "sv",      # Swedish
             "pl",      # Polish
             "hi",      # Hindi
         ]
     def get_provider_name(self) -> str: