ciyidogan committed
Commit a532986 · verified · 1 Parent(s): 6aeaf3c

Update stt/stt_google.py

Files changed (1):
  1. stt/stt_google.py +69 -334
stt/stt_google.py CHANGED
@@ -1,19 +1,16 @@
 """
-Google Cloud Speech-to-Text Implementation
+Google Cloud Speech-to-Text Implementation - Simple Batch Mode
 """
-import asyncio
-from typing import AsyncIterator, Optional, List, Any
+from typing import Optional, List
 from datetime import datetime
-import queue
-import threading
-import traceback
-import os
+import io
+import wave
 from google.cloud import speech
-from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
-import google.auth
+from google.cloud.speech import RecognitionConfig, RecognitionAudio
 from utils.logger import log_info, log_error, log_debug, log_warning
 from .stt_interface import STTInterface, STTConfig, TranscriptionResult
 
+
 class GoogleSTT(STTInterface):
     def __init__(self, credentials_path: Optional[str] = None):
         """
@@ -21,16 +18,7 @@ class GoogleSTT(STTInterface):
         Args:
             credentials_path: Path to service account JSON file (optional if using default credentials)
         """
-        try:
-            # ✅ Path check for debugging
-            if credentials_path:
-                import os
-                if not os.path.exists(credentials_path):
-                    log_error(f"❌ Credentials file not found at: {credentials_path}")
-                    raise FileNotFoundError(f"Credentials file not found: {credentials_path}")
-
-            log_info(f"📁 Using credentials from: {credentials_path}")
-
+        try:
             # Initialize client
             if credentials_path:
                 self.client = speech.SpeechClient.from_service_account_file(credentials_path)
@@ -40,22 +28,6 @@ class GoogleSTT(STTInterface):
                 self.client = speech.SpeechClient()
                 log_info("✅ Google STT initialized with default credentials")
 
-            # Streaming state
-            self.is_streaming = False
-            self.audio_generator = None
-            self.responses_stream = None
-            self.audio_queue = queue.Queue()
-            self.results_queue = queue.Queue(maxsize=100)
-
-            # Session tracking
-            self.session_id = 0
-            self.total_audio_bytes = 0
-            self.total_chunks = 0
-
-            # Threading
-            self.stream_thread = None
-            self.stop_event = threading.Event()
-
         except Exception as e:
             log_error(f"❌ Failed to initialize Google STT: {str(e)}")
             raise
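For orientation before the main hunk below: construction is unaffected by this commit. A minimal usage sketch, assuming a hypothetical service-account path; with no path, SpeechClient() falls back to Application Default Credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS):

    from stt.stt_google import GoogleSTT

    # Explicit service-account JSON (hypothetical path, for illustration only):
    stt = GoogleSTT(credentials_path="creds/google-stt.json")

    # Or rely on Application Default Credentials:
    stt = GoogleSTT()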
@@ -80,333 +52,96 @@ class GoogleSTT(STTInterface):
         }
         return language_map.get(language, language)
 
-    async def start_streaming(self, config: STTConfig) -> None:
-        """Initialize streaming session"""
+    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
+        """Transcribe audio data using Google Cloud Speech API"""
         try:
-            # Stop any existing stream
-            if self.is_streaming:
-                log_warning("⚠️ Previous stream still active, stopping it first")
-                await self.stop_streaming()
-                await asyncio.sleep(0.5)
+            # Check if we have audio to transcribe
+            if not audio_data:
+                log_warning("⚠️ No audio data provided")
+                return None
 
-            # Reset session data
-            self._reset_session_data()
+            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
-            log_info(f"🎤 Starting Google STT - Session #{self.session_id}")
+            # Convert to WAV format for better compatibility
+            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
 
-            # Configure recognition settings
+            # Configure recognition
             language_code = self._map_language_code(config.language)
-
-            """
-            # ✅ Google STT best practices for Turkish and single utterance
-            recognition_config = RecognitionConfig(
-                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=16000,
-                language_code="tr-TR",
-                # ✅ Ideal settings for a single utterance
-                enable_automatic_punctuation=True,
-                # Model selection - latest_long for better accuracy
-                model="latest_long",
-                # Use enhanced model if available (better for Turkish)
-                use_enhanced=True,
-                # Single channel audio
-                audio_channel_count=1,
-                # Alternative transcripts for debugging
-                max_alternatives=1,
-                # Profanity filter disabled for accuracy
-                profanity_filter=False,
-                # Word level confidence
-                enable_word_confidence=False,
-                enable_spoken_punctuation=False,
-                enable_spoken_emojis=False,
-            )
-
-            # ✅ Streaming config - optimized for final results only
-            self.streaming_config = StreamingRecognitionConfig(
-                config=recognition_config,
-                single_utterance=False,
-                interim_results=True
-            )
-            """
-
-            # ✅ SIMPLEST CONFIG - required fields only
+
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=16000,
-                language_code="tr-TR"
+                sample_rate_hertz=config.sample_rate,
+                language_code=language_code,
+                enable_automatic_punctuation=config.enable_punctuation,
+                model=config.model,
+                use_enhanced=config.use_enhanced,
+                enable_word_time_offsets=config.enable_word_timestamps,
             )
 
-            # Streaming config - simplest form
-            self.streaming_config = StreamingRecognitionConfig(
-                config=recognition_config,
-                interim_results=True
-            )
-
-            log_info(f"🔧 Google STT config: language={language_code}, "
-                     f"model=latest_long, enhanced=True, "
-                     f"single_utterance=True, interim_results=False")
-
-            # Start streaming in background thread
-            self.stop_event.clear()
-            self.stream_thread = threading.Thread(
-                target=self._stream_recognition,
-                daemon=True
-            )
-            self.stream_thread.start()
-
-            self.is_streaming = True
-            log_info(f"✅ Google STT started - Ready for speech")
-
-        except Exception as e:
-            log_error(f"❌ Failed to start Google STT", error=str(e))
-            self.is_streaming = False
-            raise
-
-    def _stream_recognition(self):
-        """Background thread for streaming recognition"""
-        try:
-            log_debug("🎙️ Starting recognition stream thread")
-
-            # ✅ Log the config
-            log_debug(f"Config details: {self.streaming_config}")
-
-            # Create audio generator
-            audio_generator = self._audio_generator()
-
-            # ✅ More detailed error handling
-            try:
-                # Start streaming recognition
-                responses = self.client.streaming_recognize(
-                    self.streaming_config,
-                    audio_generator
-                )
-            except Exception as api_error:
-                log_error(f"❌ Google API error: {str(api_error)}")
-                log_error(f"❌ Error type: {type(api_error).__name__}")
-                if hasattr(api_error, 'details'):
-                    log_error(f"❌ Error details: {api_error.details()}")
-                if hasattr(api_error, '__dict__'):
-                    log_error(f"❌ Error attributes: {api_error.__dict__}")
-                import traceback
-                log_error(f"❌ Full traceback: {traceback.format_exc()}")
-                raise
-
-            # Process responses
-            for response in responses:
-                if self.stop_event.is_set():
-                    break
-
-                if not response.results:
-                    continue
-
-                # Process each result
-                for result in response.results:
-                    if not result.alternatives:
-                        continue
-
-                    # Get best alternative
+            # Create audio object
+            audio = RecognitionAudio(content=wav_audio)
+
+            # Perform synchronous recognition
+            log_info(f"🔄 Sending audio to Google Cloud Speech API...")
+            response = self.client.recognize(config=recognition_config, audio=audio)
+
+            # Process results
+            if response.results:
+                result = response.results[0]
+                if result.alternatives:
                     alternative = result.alternatives[0]
 
-                    # Only process if we have transcript
-                    if alternative.transcript:
-                        # Log interim results but don't enqueue them
-                        if not result.is_final:
-                            log_debug(f"📝 Interim transcript (ignored): '{alternative.transcript}'")
-                            continue
-
-                        # ✅ Only process final results
-                        transcription_result = TranscriptionResult(
-                            text=alternative.transcript,
-                            is_final=result.is_final,
-                            confidence=alternative.confidence,
-                            timestamp=datetime.now().timestamp()
-                        )
-
-                        try:
-                            self.results_queue.put(transcription_result)
-
-                            if result.is_final:
-                                log_info(f"🎯 FINAL TRANSCRIPT: '{alternative.transcript}' "
-                                         f"(confidence: {alternative.confidence:.2f})")
-                                # Single utterance mode will end stream after this
-                                break
-                            else:
-                                # This shouldn't happen with interim_results=False
-                                log_debug(f"📝 Transcript: '{alternative.transcript}'")
-
-                        except queue.Full:
-                            log_warning("⚠️ Results queue full")
-
-                # Check if stream ended due to single_utterance
-                if hasattr(response, 'speech_event_type'):
-                    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
-                        log_info("🔚 End of single utterance detected")
-                        break
-
-        except Exception as e:
-            if not self.stop_event.is_set():
-                log_error(f"❌ Recognition stream error: {str(e)}")
-                # Put error in queue
-                error_result = TranscriptionResult(
-                    text="",
-                    is_final=True,
-                    confidence=0.0,
-                    timestamp=datetime.now().timestamp()
-                )
-                self.results_queue.put(error_result)
-        finally:
-            log_debug("🎙️ Recognition stream thread ended")
-            self.is_streaming = False
-
-    def _audio_generator(self):
-        """Generator that yields audio chunks for streaming"""
-        chunk_count = 0
-        try:
-            while not self.stop_event.is_set():
-                try:
-                    # Get audio chunk with timeout
-                    chunk = self.audio_queue.get(timeout=0.1)
-
-                    if chunk is None:  # Sentinel value
-                        log_debug("🔚 Audio generator received sentinel, stopping")
-                        break
-
-                    # ✅ Chunk info for debugging
-                    chunk_count += 1
-                    if chunk_count <= 5:  # Detailed log for the first 5 chunks
-                        log_debug(f"🎵 Audio generator yielding chunk #{chunk_count}, size: {len(chunk)} bytes")
-                    # Make sure the chunk is of type bytes
-                    if not isinstance(chunk, bytes):
-                        log_error(f"❌ Chunk is not bytes! Type: {type(chunk)}")
-                        continue
-
-                    # The format the Google API expects
-                    yield chunk
-
-                except queue.Empty:
-                    continue
-                except Exception as e:
-                    log_error(f"❌ Audio generator error: {str(e)}")
-                    break
-        finally:
-            log_debug(f"🎙️ Audio generator stopped after {chunk_count} chunks")
-
-    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
-        """Stream audio chunk and get transcription results"""
-        if not self.is_streaming:
-            raise RuntimeError("Streaming not started. Call start_streaming() first.")
-
-        try:
-            # ✅ Check the audio chunk type
-            if not isinstance(audio_chunk, bytes):
-                log_error(f"❌ Audio chunk is not bytes! Type: {type(audio_chunk)}")
-                raise TypeError(f"Expected bytes, got {type(audio_chunk)}")
-
-            # Log the chunk size
-            if self.total_chunks < 5:
-                log_debug(f"📦 Adding audio chunk #{self.total_chunks} to queue, size: {len(audio_chunk)} bytes")
-
-            # Add audio to queue for background thread
-            self.audio_queue.put(audio_chunk)
-
-            self.total_chunks += 1
-            self.total_audio_bytes += len(audio_chunk)
-
-            # Log progress
-            if self.total_chunks % 50 == 0:
-                log_debug(f"📊 Processing... {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")
-
-            # Check for results
-            while True:
-                try:
-                    result = self.results_queue.get_nowait()
-
-                    # Log for debugging
-                    log_debug(f"🎯 Yielding result: is_final={result.is_final}, text='{result.text}'")
-
-                    yield result
-
-                    # If final result, stream will end
-                    if result.is_final:
-                        self.is_streaming = False
-
-                except queue.Empty:
-                    break
-
-        except Exception as e:
-            log_error(f"❌ Error streaming audio", error=str(e))
-            self.is_streaming = False
-            raise
-
-    async def stop_streaming(self) -> Optional[TranscriptionResult]:
-        """Stop streaming and clean up"""
-        if not self.is_streaming:
-            log_debug("Already stopped, nothing to do")
+                    # Extract word timestamps if available
+                    word_timestamps = None
+                    if config.enable_word_timestamps and hasattr(alternative, 'words'):
+                        word_timestamps = [
+                            {
+                                "word": word_info.word,
+                                "start_time": word_info.start_time.total_seconds(),
+                                "end_time": word_info.end_time.total_seconds()
+                            }
+                            for word_info in alternative.words
+                        ]
+
+                    transcription = TranscriptionResult(
+                        text=alternative.transcript,
+                        confidence=alternative.confidence,
+                        timestamp=datetime.now().timestamp(),
+                        language=language_code,
+                        word_timestamps=word_timestamps
+                    )
+
+                    log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
+                    return transcription
+
+            log_warning("⚠️ No transcription results")
             return None
 
-        try:
-            log_info(f"🛑 Stopping Google STT session #{self.session_id}")
-
-            self.is_streaming = False
-
-            # Signal stop
-            self.stop_event.set()
-
-            # Send sentinel to audio queue
-            self.audio_queue.put(None)
-
-            # Wait for thread to finish
-            if self.stream_thread and self.stream_thread.is_alive():
-                self.stream_thread.join(timeout=2.0)
-
-            # Get final result if any
-            final_result = None
-            while not self.results_queue.empty():
-                try:
-                    result = self.results_queue.get_nowait()
-                    if result.is_final and result.text:
-                        final_result = result
-                except queue.Empty:
-                    break
-
-            log_info(f"✅ Google STT session #{self.session_id} stopped")
-            return final_result
-
         except Exception as e:
-            log_error(f"❌ Error during stop_streaming", error=str(e))
-            self.is_streaming = False
+            log_error(f"❌ Error during transcription: {str(e)}")
+            import traceback
+            log_error(f"Traceback: {traceback.format_exc()}")
             return None
 
-    def _reset_session_data(self):
-        """Reset session-specific data"""
-        # Clear queues
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-            except:
-                pass
-
-        while not self.results_queue.empty():
-            try:
-                self.results_queue.get_nowait()
-            except:
-                pass
-
-        # Reset counters
-        self.total_audio_bytes = 0
-        self.total_chunks = 0
-        self.session_id += 1
-
-        log_debug(f"🔄 Session data reset. New session ID: {self.session_id}")
-
-    def supports_realtime(self) -> bool:
-        """Google STT supports real-time streaming"""
-        return True
+    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
+        """Convert raw PCM audio to WAV format"""
+        # Create WAV file in memory
+        wav_buffer = io.BytesIO()
+
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            # Set WAV parameters
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(audio_data)
+
+        # Get WAV data
+        wav_buffer.seek(0)
+        return wav_buffer.read()
 
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
         # Google Cloud Speech-to-Text supported languages (partial list)
-        # Full list: https://cloud.google.com/speech-to-text/docs/languages
         return [
             "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
             "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",