Update stt/stt_google.py

stt/stt_google.py (CHANGED: +66, -154)
Old version (removed or changed lines are marked with "-"):

@@ -4,7 +4,7 @@ Google Cloud Speech-to-Text Implementation
import os
import asyncio
from typing import AsyncIterator, AsyncGenerator, Optional, List, Any
- import numpy as np
from datetime import datetime
import sys
import queue
@@ -43,16 +43,15 @@ class GoogleCloudSTT(STTInterface):
        self.session_id = 0
        self.stream_start_time = None

-         #
-         self.lock = threading.Lock()
-         self.single_utterance = False
-         self.chunk_count = 0
-         self.total_bytes = 0
-         self.stop_event = threading.Event()

        # Set Google credentials
        if credentials_path:
-             # Use the path coming from ConfigProvider
            if os.path.exists(credentials_path):
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
                log_info(f"Google credentials set from: {credentials_path}")
@@ -63,14 +62,12 @@ class GoogleCloudSTT(STTInterface):
            # Fallback to environment variable
            creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
            if not creds_path:
-                 # Try default location
                creds_path = "./credentials/google-service-account.json"
                if os.path.exists(creds_path):
                    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
                    log_info(f"Google credentials set from default: {creds_path}")
                else:
                    raise ValueError("Google credentials not found. Please provide credentials_path")
-

        # Test credentials
        try:
@@ -95,19 +92,20 @@ class GoogleCloudSTT(STTInterface):
        }
        return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)

    async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
        """Stream audio chunk and get results"""
        if not self.is_streaming:
            raise Exception("Streaming not started")

        try:
-             # Audio validation and logging
            chunk_size = len(audio_chunk)

            # Log first chunk details
            if self.chunk_count == 0:
                log_info(f"First chunk - size: {chunk_size} bytes")
-                 # Check for WEBM header
                if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
                    log_info("Valid WEBM header detected")
                else:
@@ -116,16 +114,14 @@ class GoogleCloudSTT(STTInterface):

            # Try to measure audio level (if it's raw PCM)
            try:
-                 log_info(f"Audio level estimate: {db:.1f} dB")
            except:
-                 # Expected for WEBM format
                pass

            # Put chunk in queue
@@ -137,8 +133,8 @@ class GoogleCloudSTT(STTInterface):
            if self.chunk_count % 50 == 0:
                log_info(f"Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")

-             # Check for responses
-             timeout = 0.1
            end_time = time.time() + timeout

            while time.time() < end_time:
@@ -147,7 +143,6 @@ class GoogleCloudSTT(STTInterface):
                    log_info(f"Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
                    yield result
                except queue.Empty:
-                     # No results yet, continue
                    await asyncio.sleep(0.01)
                except Exception as e:
                    log_error(f"Error getting result from queue: {e}")
@@ -166,18 +161,19 @@ class GoogleCloudSTT(STTInterface):
        try:
            log_info(f"Stopping Google STT streaming session #{self.session_id}")

-             #
            self.is_streaming = False
            self.stop_event.set()

-             # Send poison pill
            if self.audio_queue:
                try:
                    self.audio_queue.put(None)
                except:
                    pass

-             #
            if self.stream_thread and self.stream_thread.is_alive():
                log_info("Waiting for stream thread to finish...")
                self.stream_thread.join(timeout=5.0)
@@ -187,18 +183,18 @@ class GoogleCloudSTT(STTInterface):
            else:
                log_info("Stream thread finished")

-             #
            final_result = None
            if self.responses_queue:
                while not self.responses_queue.empty():
                    try:
-                         result = self.responses_queue.get_nowait()
                        if result.is_final:
                            final_result = result
-                     except queue.Empty:
                        break

-             #
            if self.client:
                try:
                    if hasattr(self.client, 'transport') and hasattr(self.client.transport, 'close'):
@@ -213,11 +209,9 @@ class GoogleCloudSTT(STTInterface):
                finally:
                    self.client = None

-             #
            self.audio_queue = None
            self.responses_queue = None
-
-             # Reset the remaining variables
            self.stream_thread = None
            self.streaming_config = None
            self.stop_event.clear()
@@ -227,7 +221,6 @@ class GoogleCloudSTT(STTInterface):

        except Exception as e:
            log_error(f"Error during stop_streaming", error=str(e))
-             # Force cleanup on error
            self.is_streaming = False
            self.stream_thread = None
            self.client = None
@@ -244,19 +237,8 @@ class GoogleCloudSTT(STTInterface):
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
-             "tr-TR",
-             "en-US",  # English (US)
-             "en-GB",  # English (UK)
-             "de-DE",  # German
-             "fr-FR",  # French
-             "es-ES",  # Spanish
-             "it-IT",  # Italian
-             "pt-BR",  # Portuguese (Brazil)
-             "ru-RU",  # Russian
-             "ja-JP",  # Japanese
-             "ko-KR",  # Korean
-             "zh-CN",  # Chinese (Simplified)
-             "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
@@ -283,21 +265,18 @@ class GoogleCloudSTT(STTInterface):
        self.error_message = None
        self.session_id += 1
        self.stream_start_time = time.time()
-
-         # Reset counters
        self.chunk_count = 0
        self.total_bytes = 0

        log_info(f"Google STT session data reset. New session ID: {self.session_id}")

-         # Create fresh queues
        self.audio_queue = queue.Queue()
        self.responses_queue = queue.Queue()
        log_debug("Created fresh queues")

    def _create_fresh_queues(self):
        """Create fresh queue instances"""
-         # Clear out the old queues
        if self.audio_queue:
            while not self.audio_queue.empty():
                try:
@@ -312,35 +291,27 @@ class GoogleCloudSTT(STTInterface):
                except:
                    pass

-
-         self.audio_queue = queue.Queue(maxsize=1000)  # add a max size
        self.responses_queue = queue.Queue(maxsize=100)
        log_debug("Created fresh queues")

    def _request_generator(self):
        """Generate requests for the streaming recognize API"""
-         # First request
        yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)

-         #
        while not self.should_stop:
            try:
-                 # Get audio chunk from queue with timeout
                audio_chunk = self.audio_queue.get(timeout=0.1)

                if audio_chunk is None:
-                     # Poison pill received
                    log_info("Poison pill received, stopping request generator")
                    break

-                 # Send audio chunk
                yield speech.StreamingRecognizeRequest(audio_content=audio_chunk)

-                 self.chunk_count += 1
-                 self.total_bytes += len(audio_chunk)
-
            except queue.Empty:
-                 # No audio available, continue waiting
                continue
            except Exception as e:
                log_error(f"Error in request generator: {e}")
@@ -348,30 +319,27 @@ class GoogleCloudSTT(STTInterface):

        log_info(f"Request generator finished. Total chunks: {self.chunk_count}, Total bytes: {self.total_bytes}")

-     async def start_streaming(self, config:
        """Initialize streaming session with clean state"""
        try:
-             #
            if self.is_streaming or self.stream_thread:
                log_warning("Previous stream still active, stopping it first")
                await self.stop_streaming()
-                 # Wait for the cleanup to finish
                await asyncio.sleep(0.5)

-             #
            self._reset_session()
-
            self.single_utterance = config.single_utterance

            log_info(f"Starting Google STT streaming session #{self.session_id} with config: {config}")

-             #
            self._create_fresh_queues()
-
-             # Clear the stop event
            self.stop_event.clear()

-             #
            self.client = speech.SpeechClient()
            log_info("Created new Google Speech client")

@@ -384,20 +352,19 @@ class GoogleCloudSTT(STTInterface):
                model=config.model,
                use_enhanced=config.use_enhanced,
                max_alternatives=1,
-                 # Metadata for better recognition
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                )
-             )

            # Create streaming config with VAD
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
-                 interim_results=config.interim_results,
-                 single_utterance=config.single_utterance,
-                 enable_voice_activity_events=True  # VAD events
            )

            log_info(f"Streaming config created: interim_results={config.interim_results}, "
@@ -406,12 +373,12 @@ class GoogleCloudSTT(STTInterface):

            self.is_streaming = True

-             # Start streaming thread
            self.stream_thread = threading.Thread(
                target=self._run_stream,
                name=f"GoogleSTT-Session-{self.session_id}"
            )
-             self.stream_thread.daemon = True
            self.stream_thread.start()

            log_info(f"Google STT streaming session #{self.session_id} started successfully")
@@ -420,55 +387,33 @@ class GoogleCloudSTT(STTInterface):
            log_error(f"Failed to start Google STT streaming", error=str(e))
            self.is_streaming = False
            self.client = None
-             self._create_fresh_queues()
            raise

-     def _put_result(self, result: TranscriptionResult):
-         """Helper to put result in queue"""
-         try:
-             self.responses_queue.put(result)
-             # Debug log removed
-         except Exception as e:
-             log_error(f"Error queuing result: {e}")
-
    def _run_stream(self):
        """Run the streaming recognition loop in a separate thread"""
        try:
-             log_info("Google STT stream thread started - Single utterance mode: {

            # Create request generator
            requests = self._request_generator()

            # Create streaming client
            log_info("Creating Google STT streaming client...")
-
-             # Set a timeout for the streaming call
-             import grpc
-             timeout = 300  # 5 minutes max for the stream
-
-             # Create streaming client with timeout
-             responses = self.client.streaming_recognize(
-                 self.streaming_config,
-                 requests,
-                 timeout=timeout
-             )
-
-             # Set initial response timeout
-             initial_response_timeout = 30  # 30 seconds to get first response
-             stream_start = time.time()
-             got_first_response = False

-             #
            first_response_time = None
            response_count = 0

-             # Process responses
            for response in responses:
                response_count += 1

                if first_response_time is None:
@@ -476,47 +421,14 @@ class GoogleCloudSTT(STTInterface):
                    elapsed = first_response_time - self.stream_start_time
                    log_info(f"FIRST RESPONSE from Google STT after {elapsed:.2f}s")

-                 #
-                     log_info("Empty response from Google STT (no results)")
-                     continue
-
-                 # Log all results in detail
-                 for result_idx, result in enumerate(response.results):
-                     log_info(f"Result #{result_idx}: is_final={result.is_final}, "
-                              f"alternatives={len(result.alternatives)}, "
-                              f"stability={getattr(result, 'stability', 'N/A')}")

-                     log_info(f"Transcript: '{best_alternative.transcript}' "
-                              f"(confidence: {best_alternative.confidence:.3f})")
-
-                     # Put result in queue
-                     result_obj = TranscriptionResult(
-                         text=best_alternative.transcript,
-                         is_final=result.is_final,
-                         confidence=best_alternative.confidence,
-                         timestamp=datetime.utcnow()
-                     )
-
-                     self.responses_queue.put(result_obj)
-                     log_info(f"Result queued: is_final={result.is_final}, text='{best_alternative.transcript[:50]}...'")
-
-             # Log if we exit without any responses
-             if response_count == 0:
-                 log_error("Google STT stream ended without ANY responses!")
-             else:
-                 log_info(f"Google STT stream ended normally after {response_count} responses")

-                 log_error(f"Error details: {e.details}")
-             self.error_message = str(e)
-         finally:
-             log_info("Google STT stream thread ended")
-             with self.lock:
-                 self.is_streaming = False
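Both the removed and the added versions of the request generator hinge on the same shutdown idiom: audio chunks are pushed into a bounded queue from the async side, a background thread drains it, and a None "poison pill" (together with a stop flag) tells the consumer to exit. A minimal standalone sketch of that idiom, with illustrative names that are not taken from the file:

import queue
import threading

def consume(audio_queue: queue.Queue) -> None:
    # Illustrative consumer: drain the queue until a None sentinel arrives.
    while True:
        try:
            chunk = audio_queue.get(timeout=0.1)  # poll so a stop flag could also be checked
        except queue.Empty:
            continue
        if chunk is None:  # poison pill: producer asks the consumer to stop
            break
        print(f"would send {len(chunk)} bytes to the recognizer")

q: queue.Queue = queue.Queue(maxsize=1000)
worker = threading.Thread(target=consume, args=(q,), daemon=True)
worker.start()
q.put(b"\x00" * 320)  # stand-in for an audio chunk
q.put(None)           # request shutdown
worker.join(timeout=5.0)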
New version (added lines are marked with "+"):

@@ -4,7 +4,7 @@ Google Cloud Speech-to-Text Implementation
import os
import asyncio
from typing import AsyncIterator, AsyncGenerator, Optional, List, Any
+ import numpy as np
from datetime import datetime
import sys
import queue
@@ -43,16 +43,15 @@ class GoogleCloudSTT(STTInterface):
        self.session_id = 0
        self.stream_start_time = None

+         # Additional attributes
+         self.lock = threading.Lock()
+         self.single_utterance = False
+         self.chunk_count = 0
+         self.total_bytes = 0
+         self.stop_event = threading.Event()

        # Set Google credentials
        if credentials_path:
            if os.path.exists(credentials_path):
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
                log_info(f"Google credentials set from: {credentials_path}")
@@ -63,14 +62,12 @@ class GoogleCloudSTT(STTInterface):
            # Fallback to environment variable
            creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
            if not creds_path:
                creds_path = "./credentials/google-service-account.json"
                if os.path.exists(creds_path):
                    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
                    log_info(f"Google credentials set from default: {creds_path}")
                else:
                    raise ValueError("Google credentials not found. Please provide credentials_path")

        # Test credentials
        try:
@@ -95,19 +92,20 @@ class GoogleCloudSTT(STTInterface):
        }
        return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)

+     # Alias for compatibility
+     _get_google_encoding = _get_encoding
+
    async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
        """Stream audio chunk and get results"""
        if not self.is_streaming:
            raise Exception("Streaming not started")

        try:
            chunk_size = len(audio_chunk)

            # Log first chunk details
            if self.chunk_count == 0:
                log_info(f"First chunk - size: {chunk_size} bytes")
                if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
                    log_info("Valid WEBM header detected")
                else:
@@ -116,16 +114,14 @@ class GoogleCloudSTT(STTInterface):

            # Try to measure audio level (if it's raw PCM)
            try:
+                 if encoding_str == "LINEAR16":  # Only for raw PCM
+                     audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
+                     if len(audio_array) > 0:
+                         rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
+                         db = 20 * np.log10(max(rms, 1) / 32768.0)
+                         if self.chunk_count % 50 == 0:
+                             log_info(f"Audio level: {db:.1f} dB")
            except:
                pass

            # Put chunk in queue
@@ -137,8 +133,8 @@ class GoogleCloudSTT(STTInterface):
            if self.chunk_count % 50 == 0:
                log_info(f"Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")

+             # Check for responses
+             timeout = 0.1
            end_time = time.time() + timeout

            while time.time() < end_time:
@@ -147,7 +143,6 @@ class GoogleCloudSTT(STTInterface):
                    log_info(f"Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
                    yield result
                except queue.Empty:
                    await asyncio.sleep(0.01)
                except Exception as e:
                    log_error(f"Error getting result from queue: {e}")
@@ -166,18 +161,19 @@ class GoogleCloudSTT(STTInterface):
        try:
            log_info(f"Stopping Google STT streaming session #{self.session_id}")

+             # Set flags
            self.is_streaming = False
+             self.should_stop = True
            self.stop_event.set()

+             # Send poison pill
            if self.audio_queue:
                try:
                    self.audio_queue.put(None)
                except:
                    pass

+             # Wait for thread
            if self.stream_thread and self.stream_thread.is_alive():
                log_info("Waiting for stream thread to finish...")
                self.stream_thread.join(timeout=5.0)
@@ -187,18 +183,18 @@ class GoogleCloudSTT(STTInterface):
            else:
                log_info("Stream thread finished")

+             # Get final result
            final_result = None
            if self.responses_queue:
                while not self.responses_queue.empty():
                    try:
+                         result = self.responses_queue.get_nowait()
                        if result.is_final:
                            final_result = result
+                     except queue.Empty:
                        break

+             # Close client
            if self.client:
                try:
                    if hasattr(self.client, 'transport') and hasattr(self.client.transport, 'close'):
@@ -213,11 +209,9 @@ class GoogleCloudSTT(STTInterface):
                finally:
                    self.client = None

+             # Reset state
            self.audio_queue = None
            self.responses_queue = None
            self.stream_thread = None
            self.streaming_config = None
            self.stop_event.clear()
@@ -227,7 +221,6 @@ class GoogleCloudSTT(STTInterface):

        except Exception as e:
            log_error(f"Error during stop_streaming", error=str(e))
            self.is_streaming = False
            self.stream_thread = None
            self.client = None
@@ -244,19 +237,8 @@ class GoogleCloudSTT(STTInterface):
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
+             "tr-TR", "en-US", "en-GB", "de-DE", "fr-FR", "es-ES",
+             "it-IT", "pt-BR", "ru-RU", "ja-JP", "ko-KR", "zh-CN", "ar-SA"
        ]

    def get_provider_name(self) -> str:
@@ -283,21 +265,18 @@ class GoogleCloudSTT(STTInterface):
        self.error_message = None
        self.session_id += 1
        self.stream_start_time = time.time()
        self.chunk_count = 0
        self.total_bytes = 0

        log_info(f"Google STT session data reset. New session ID: {self.session_id}")

+         # Create fresh queues
        self.audio_queue = queue.Queue()
        self.responses_queue = queue.Queue()
        log_debug("Created fresh queues")

    def _create_fresh_queues(self):
        """Create fresh queue instances"""
        if self.audio_queue:
            while not self.audio_queue.empty():
                try:
@@ -312,35 +291,27 @@ class GoogleCloudSTT(STTInterface):
                except:
                    pass

+         self.audio_queue = queue.Queue(maxsize=1000)
        self.responses_queue = queue.Queue(maxsize=100)
        log_debug("Created fresh queues")

    def _request_generator(self):
        """Generate requests for the streaming recognize API"""
+         # First request with config
        yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)

+         # Audio chunks
        while not self.should_stop:
            try:
                audio_chunk = self.audio_queue.get(timeout=0.1)

                if audio_chunk is None:
                    log_info("Poison pill received, stopping request generator")
                    break

                yield speech.StreamingRecognizeRequest(audio_content=audio_chunk)

            except queue.Empty:
                continue
            except Exception as e:
                log_error(f"Error in request generator: {e}")
@@ -348,30 +319,27 @@ class GoogleCloudSTT(STTInterface):

        log_info(f"Request generator finished. Total chunks: {self.chunk_count}, Total bytes: {self.total_bytes}")

+     async def start_streaming(self, config: STTConfig) -> None:
        """Initialize streaming session with clean state"""
        try:
+             # Clean up any existing stream
            if self.is_streaming or self.stream_thread:
                log_warning("Previous stream still active, stopping it first")
                await self.stop_streaming()
                await asyncio.sleep(0.5)

+             # Reset session
            self._reset_session()
            self.single_utterance = config.single_utterance

            log_info(f"Starting Google STT streaming session #{self.session_id} with config: {config}")

+             # Create fresh queues
            self._create_fresh_queues()
            self.stop_event.clear()
+             self.should_stop = False

+             # Create new client
            self.client = speech.SpeechClient()
            log_info("Created new Google Speech client")

@@ -384,20 +352,19 @@ class GoogleCloudSTT(STTInterface):
                model=config.model,
                use_enhanced=config.use_enhanced,
                max_alternatives=1,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                )
+             )

            # Create streaming config with VAD
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
+                 interim_results=config.interim_results,
+                 single_utterance=config.single_utterance,
+                 enable_voice_activity_events=True  # VAD events enabled
            )

            log_info(f"Streaming config created: interim_results={config.interim_results}, "
@@ -406,12 +373,12 @@ class GoogleCloudSTT(STTInterface):

            self.is_streaming = True

+             # Start streaming thread
            self.stream_thread = threading.Thread(
                target=self._run_stream,
                name=f"GoogleSTT-Session-{self.session_id}"
            )
+             self.stream_thread.daemon = True
            self.stream_thread.start()

            log_info(f"Google STT streaming session #{self.session_id} started successfully")
@@ -420,55 +387,33 @@ class GoogleCloudSTT(STTInterface):
            log_error(f"Failed to start Google STT streaming", error=str(e))
            self.is_streaming = False
            self.client = None
+             self._create_fresh_queues()
            raise

    def _run_stream(self):
        """Run the streaming recognition loop in a separate thread"""
        try:
+             log_info(f"Google STT stream thread started - Single utterance mode: {self.single_utterance}")

            # Create request generator
            requests = self._request_generator()

            # Create streaming client
            log_info("Creating Google STT streaming client...")

+             # Get responses (no timeout parameter!)
+             responses = self.client.streaming_recognize(self.streaming_config, requests)
+
+             # Track responses
            first_response_time = None
            response_count = 0

+             # Process responses
            for response in responses:
+                 if self.should_stop:
+                     log_info("Stop flag detected, ending stream")
+                     break
+
                response_count += 1

                if first_response_time is None:
@@ -476,47 +421,14 @@ class GoogleCloudSTT(STTInterface):
                    elapsed = first_response_time - self.stream_start_time
                    log_info(f"FIRST RESPONSE from Google STT after {elapsed:.2f}s")

+                 # Check for VAD events
+                 if hasattr(response, 'speech_event_type') and response.speech_event_type:
+                     event_type = response.speech_event_type
+                     log_info(f"VAD Event: {event_type}")

+                     if event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
+                         log_info("End of utterance detected by VAD")

+                 # Log response
+                 has_results = len(response.results) > 0 if hasattr(response, 'results') else False
+                 log_info(f"Google STT Response #{response_count}: has_results={has_results}")
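For context, a hedged sketch of how the updated class might be driven end to end. The import path, the constructor argument, and the shapes of STTConfig and TranscriptionResult are assumptions based only on what this diff shows (start_streaming, stream_audio as an async generator, stop_streaming, and results carrying text and is_final):

from stt.stt_google import GoogleCloudSTT  # assumed import path for this file

async def transcribe(chunks, config) -> None:
    # config is assumed to be an STTConfig with the fields referenced in the diff
    # (interim_results, single_utterance, model, use_enhanced, ...).
    stt = GoogleCloudSTT(credentials_path="./credentials/google-service-account.json")
    await stt.start_streaming(config)
    try:
        for chunk in chunks:  # e.g. WEBM/Opus chunks received from a client
            async for result in stt.stream_audio(chunk):
                label = "final" if result.is_final else "interim"
                print(f"[{label}] {result.text}")
    finally:
        await stt.stop_streaming()

# Hypothetical usage from an event loop: asyncio.run(transcribe(webm_chunks, config))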