Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +170 -83
stt/stt_google.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, List
|
|
5 |
from datetime import datetime
|
6 |
import io
|
7 |
import wave
|
|
|
8 |
from google.cloud import speech
|
9 |
from google.cloud.speech import RecognitionConfig, RecognitionAudio
|
10 |
from utils.logger import log_info, log_error, log_debug, log_warning
|
@@ -65,7 +66,66 @@ class GoogleSTT(STTInterface):
|
|
65 |
|
66 |
# Default to the language itself if not in map
|
67 |
return language_map.get(language, language)
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
70 |
"""Transcribe audio data using Google Cloud Speech API"""
|
71 |
try:
|
@@ -76,85 +136,99 @@ class GoogleSTT(STTInterface):
|
|
76 |
|
77 |
log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
|
78 |
|
79 |
-
# ✅
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
log_debug(f"Audio first 50 bytes: {first_50.hex()}")
|
85 |
-
log_debug(f"Audio last 50 bytes: {last_50.hex()}")
|
86 |
-
|
87 |
-
# Ortalama amplitude kontrolü
|
88 |
-
import struct
|
89 |
-
samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
|
90 |
-
avg_amplitude = sum(abs(s) for s in samples) / len(samples)
|
91 |
-
max_amplitude = max(abs(s) for s in samples)
|
92 |
-
log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
|
93 |
-
|
94 |
-
# ✅ Convert to WAV format for better compatibility
|
95 |
-
wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
|
96 |
log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
|
97 |
|
98 |
# Configure recognition
|
99 |
language_code = self._map_language_code(config.language)
|
100 |
|
101 |
-
# ✅ WAV
|
102 |
recognition_config = RecognitionConfig(
|
103 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
104 |
-
sample_rate_hertz=
|
105 |
-
language_code=
|
106 |
-
audio_channel_count=1,
|
107 |
enable_separate_recognition_per_channel=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
)
|
109 |
|
110 |
-
log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=
|
111 |
|
112 |
-
# ✅ Create audio object with WAV data
|
113 |
-
audio = RecognitionAudio(content=wav_audio)
|
114 |
|
115 |
# Perform synchronous recognition
|
116 |
log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
|
117 |
response = self.client.recognize(config=recognition_config, audio=audio)
|
118 |
|
119 |
-
# ✅
|
120 |
log_debug(f"API Response: {response}")
|
121 |
log_info(f"🔍 Google response details:")
|
122 |
log_info(f"- Has results: {bool(response.results)}")
|
123 |
log_info(f"- Results count: {len(response.results)}")
|
124 |
|
|
|
|
|
|
|
|
|
125 |
if hasattr(response, 'total_billed_time'):
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
else:
|
128 |
log_info(f"- Billed time: 0s (no audio processed)")
|
129 |
|
130 |
# Process results
|
131 |
if response.results:
|
132 |
-
result
|
133 |
-
|
134 |
-
alternative = result.alternatives[0]
|
135 |
-
|
136 |
-
# Extract word timestamps if available
|
137 |
-
word_timestamps = None
|
138 |
-
if config.enable_word_timestamps and hasattr(alternative, 'words'):
|
139 |
-
word_timestamps = [
|
140 |
-
{
|
141 |
-
"word": word_info.word,
|
142 |
-
"start_time": word_info.start_time.total_seconds(),
|
143 |
-
"end_time": word_info.end_time.total_seconds()
|
144 |
-
}
|
145 |
-
for word_info in alternative.words
|
146 |
-
]
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
log_warning("⚠️ No transcription results - Google couldn't recognize speech")
|
160 |
return None
|
@@ -165,44 +239,57 @@ class GoogleSTT(STTInterface):
|
|
165 |
log_error(f"Traceback: {traceback.format_exc()}")
|
166 |
return None
|
167 |
|
168 |
-
def
|
169 |
-
"""Convert raw PCM to proper WAV format"""
|
170 |
try:
|
171 |
-
|
172 |
-
|
173 |
-
# WAV
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
log_info(f"🔧 WAV
|
200 |
-
log_info(f"🔧 WAV specs: {channels}ch, {frame_rate}Hz, {sample_width*8}bit")
|
201 |
|
202 |
return wav_data
|
203 |
|
204 |
except Exception as e:
|
205 |
-
log_error(f"WAV conversion failed: {e}")
|
206 |
# Fallback to raw PCM
|
207 |
return audio_data
|
208 |
|
|
|
5 |
from datetime import datetime
|
6 |
import io
|
7 |
import wave
|
8 |
+
import struct
|
9 |
from google.cloud import speech
|
10 |
from google.cloud.speech import RecognitionConfig, RecognitionAudio
|
11 |
from utils.logger import log_info, log_error, log_debug, log_warning
|
|
|
66 |
|
67 |
# Default to the language itself if not in map
|
68 |
return language_map.get(language, language)
|
69 |
+
|
70 |
+
def _analyze_audio_content(self, audio_data: bytes, sample_rate: int = 16000):
    """Analyze raw 16-bit mono PCM audio for debugging.

    Logs overall zero/non-zero amplitude statistics, a 10-section amplitude
    breakdown, and the position of the first sample loud enough to count as
    speech. Purely diagnostic: emits log output only, returns None.

    Args:
        audio_data: Raw little-endian 16-bit PCM bytes.
        sample_rate: Sample rate in Hz, used only to convert the speech-start
            sample index into seconds. Defaults to 16000, the value the old
            code hard-coded into that calculation.
    """
    try:
        if len(audio_data) < 100:
            log_warning(f"⚠️ Very short audio data: {len(audio_data)} bytes")
            return

        # unpack_from tolerates a stray trailing byte; plain unpack raises
        # on any buffer that is not an exact multiple of 2 bytes.
        samples = struct.unpack_from(f'{len(audio_data)//2}h', audio_data)
        total_samples = len(samples)

        # Overall statistics.
        non_zero_samples = [s for s in samples if s != 0]
        zero_count = total_samples - len(non_zero_samples)
        zero_percentage = (zero_count / total_samples) * 100

        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_percentage:.1f}%)")

        if non_zero_samples:
            avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
            max_amplitude = max(abs(s) for s in non_zero_samples)
            log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

            # Per-section breakdown: split the signal into 10 chunks.
            # The last chunk absorbs the remainder so tail samples are not
            # silently dropped (the old `min((i+1)*size, total)` bound
            # skipped up to 9 trailing samples).
            section_size = total_samples // 10
            log_info(f"🔍 Section analysis (each {section_size} samples):")

            for i in range(10):
                start = i * section_size
                end = total_samples if i == 9 else (i + 1) * section_size
                section = samples[start:end]
                if not section:
                    continue  # guards the zero-division below for tiny inputs

                section_non_zero = [s for s in section if s != 0]
                section_zeros = len(section) - len(section_non_zero)
                section_zero_pct = (section_zeros / len(section)) * 100

                if section_non_zero:
                    section_max = max(abs(s) for s in section_non_zero)
                    section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero)
                    log_info(f"Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={section_zero_pct:.1f}%")

            # Find where speech starts (first significant activity).
            speech_threshold = 1000  # minimum amplitude to consider as speech
            speech_start = next(
                (i for i, sample in enumerate(samples) if abs(sample) > speech_threshold),
                None,
            )

            if speech_start is not None:
                log_info(f"🎤 Speech detected starting at sample {speech_start} ({speech_start/sample_rate:.2f}s)")
            else:
                log_warning(f"⚠️ No clear speech signal detected (threshold: {speech_threshold})")
        else:
            log_warning(f"⚠️ All samples are zero - no audio content")

    except Exception as e:
        log_error(f"❌ Error analyzing audio: {e}")
|
128 |
+
|
129 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
130 |
"""Transcribe audio data using Google Cloud Speech API"""
|
131 |
try:
|
|
|
136 |
|
137 |
log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
|
138 |
|
139 |
+
# ✅ Audio analizi
|
140 |
+
self._analyze_audio_content(audio_data)
|
141 |
+
|
142 |
+
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
143 |
+
wav_audio = self._convert_to_wav_proper(audio_data, config.sample_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
|
145 |
|
146 |
# Configure recognition
|
147 |
language_code = self._map_language_code(config.language)
|
148 |
|
149 |
+
# ✅ WAV formatı için doğru config
|
150 |
recognition_config = RecognitionConfig(
|
151 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
152 |
+
sample_rate_hertz=config.sample_rate,
|
153 |
+
language_code=language_code,
|
154 |
+
audio_channel_count=1,
|
155 |
enable_separate_recognition_per_channel=False,
|
156 |
+
# ✅ Enhanced model kullan
|
157 |
+
model="latest_long",
|
158 |
+
use_enhanced=True,
|
159 |
+
# ✅ Punctuation ekle
|
160 |
+
enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
|
161 |
+
# ✅ Profanity filter'ı kapat (daha iyi tanıma için)
|
162 |
+
profanity_filter=False,
|
163 |
+
# ✅ Audio analizi için metadata
|
164 |
+
metadata=speech.RecognitionMetadata(
|
165 |
+
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
|
166 |
+
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
|
167 |
+
original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
|
168 |
+
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
|
169 |
+
)
|
170 |
)
|
171 |
|
172 |
+
log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=latest_long")
|
173 |
|
174 |
+
# ✅ Create audio object with WAV data
|
175 |
+
audio = RecognitionAudio(content=wav_audio)
|
176 |
|
177 |
# Perform synchronous recognition
|
178 |
log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
|
179 |
response = self.client.recognize(config=recognition_config, audio=audio)
|
180 |
|
181 |
+
# ✅ Detaylı response analizi
|
182 |
log_debug(f"API Response: {response}")
|
183 |
log_info(f"🔍 Google response details:")
|
184 |
log_info(f"- Has results: {bool(response.results)}")
|
185 |
log_info(f"- Results count: {len(response.results)}")
|
186 |
|
187 |
+
# ✅ Request ID'yi logla
|
188 |
+
if hasattr(response, '_pb') and hasattr(response._pb, 'request_id'):
|
189 |
+
log_info(f"- Request ID: {response._pb.request_id}")
|
190 |
+
|
191 |
if hasattr(response, 'total_billed_time'):
|
192 |
+
billed_seconds = response.total_billed_time.total_seconds()
|
193 |
+
log_info(f"- Billed time: {billed_seconds}s")
|
194 |
+
|
195 |
+
# ✅ Eğer billed time 0 ise, Google hiç audio işlememiş demektir
|
196 |
+
if billed_seconds == 0:
|
197 |
+
log_error("❌ Google didn't process any audio - possible format issue")
|
198 |
+
return None
|
199 |
else:
|
200 |
log_info(f"- Billed time: 0s (no audio processed)")
|
201 |
|
202 |
# Process results
|
203 |
if response.results:
|
204 |
+
for i, result in enumerate(response.results):
|
205 |
+
log_debug(f"Result {i}: {result}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
+
if result.alternatives:
|
208 |
+
alternative = result.alternatives[0]
|
209 |
+
|
210 |
+
# Extract word timestamps if available
|
211 |
+
word_timestamps = None
|
212 |
+
if config.enable_word_timestamps and hasattr(alternative, 'words'):
|
213 |
+
word_timestamps = [
|
214 |
+
{
|
215 |
+
"word": word_info.word,
|
216 |
+
"start_time": word_info.start_time.total_seconds(),
|
217 |
+
"end_time": word_info.end_time.total_seconds()
|
218 |
+
}
|
219 |
+
for word_info in alternative.words
|
220 |
+
]
|
221 |
+
|
222 |
+
transcription = TranscriptionResult(
|
223 |
+
text=alternative.transcript,
|
224 |
+
confidence=alternative.confidence,
|
225 |
+
timestamp=datetime.now().timestamp(),
|
226 |
+
language=language_code,
|
227 |
+
word_timestamps=word_timestamps
|
228 |
+
)
|
229 |
+
|
230 |
+
log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
|
231 |
+
return transcription
|
232 |
|
233 |
log_warning("⚠️ No transcription results - Google couldn't recognize speech")
|
234 |
return None
|
|
|
239 |
log_error(f"Traceback: {traceback.format_exc()}")
|
240 |
return None
|
241 |
|
242 |
+
def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Convert raw 16-bit mono PCM to a standard WAV byte string.

    Uses the stdlib ``wave`` module (already imported at the top of this
    file) instead of hand-writing the 44-byte RIFF header; for mono /
    16-bit / PCM the emitted bytes are identical to the previous manual
    struct-based implementation. Also drops the old unused ``buffer_size``
    local.

    Args:
        audio_data: Raw little-endian 16-bit mono PCM bytes.
        sample_rate: Sample rate in Hz to record in the WAV header.

    Returns:
        WAV-encoded bytes, or ``audio_data`` unchanged if conversion fails
        (deliberate best-effort fallback, preserved from the original).
    """
    try:
        # Build the WAV in memory; wave.Wave_write patches the RIFF and
        # data chunk sizes on close, producing the standard 44-byte header.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)        # mono
            wav_file.setsampwidth(2)        # 16-bit samples
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)

        wav_data = wav_buffer.getvalue()
        wav_buffer.close()

        log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")

        return wav_data

    except Exception as e:
        log_error(f"❌ WAV conversion failed: {e}")
        # Fallback to raw PCM
        return audio_data
|
295 |
|