Update stt/stt_google.py
stt/stt_google.py  CHANGED  (+46 -213)
@@ -177,7 +177,7 @@ class GoogleSTT(STTInterface):
             log_error(f"❌ Silence trimming failed: {e}")
             return audio_data
 
-
+    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
         """Transcribe audio data using Google Cloud Speech API"""
         try:
             # Check if we have audio to transcribe
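Note: `transcribe` is declared `async`, so callers have to await it from an event loop. A minimal usage sketch, not part of this commit (the import path and the `GoogleSTT`/`STTConfig` constructor arguments shown here are assumptions):

import asyncio

from stt.stt_google import GoogleSTT
from stt.stt_interface import STTConfig  # assumed location of STTConfig

async def main() -> None:
    stt = GoogleSTT()
    config = STTConfig(language="tr-TR", sample_rate=16000)  # assumed fields
    with open("utterance.pcm", "rb") as f:  # raw 16-bit mono PCM
        pcm = f.read()
    result = await stt.transcribe(pcm, config)
    if result is not None:
        print(result.text, result.confidence)

asyncio.run(main())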
@@ -189,75 +189,28 @@ class GoogleSTT(STTInterface):
 
             # ✅ Audio analysis
             self._analyze_audio_content(audio_data)
-
+
             # ✅ Add silence trimming
             trimmed_audio = self._trim_silence(audio_data)
 
-
-
-
-
-            import tempfile
-            import os
-
-            # Save the raw PCM
-            pcm_file = tempfile.mktemp(suffix='.pcm')
-            with open(pcm_file, 'wb') as f:
-                f.write(trimmed_audio)
-            log_info(f"🔍 Raw PCM saved to: {pcm_file}")
-
-            # Save the WAV
-            wav_file = tempfile.mktemp(suffix='.wav')
-            with open(wav_file, 'wb') as f:
-                f.write(wav_audio)
-            log_info(f"🔍 WAV saved to: {wav_file}")
-
-            # Run it the same way as the test code
-            try:
-                import subprocess
-                result = subprocess.run([
-                    'python', 'app.py', wav_file
-                ], capture_output=True, text=True, timeout=30)
-                log_info(f"🔍 Test script result: {result.stdout}")
-                if result.stderr:
-                    log_error(f"🔍 Test script error: {result.stderr}")
-            except Exception as e:
-                log_warning(f"Could not run test script: {e}")
+            if len(trimmed_audio) < 8000:  # less than 0.5 seconds
+                log_warning("⚠️ Audio too short after trimming")
+                return None
 
-            #
-
-
-                os.unlink(wav_file)
-            except:
-                pass
+            # ✅ Exactly the same format as the test code - use the wave module
+            wav_audio = self._create_wav_like_test(trimmed_audio, config.sample_rate)
+            log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
 
-            # Configure recognition
-            language_code = self._map_language_code(config.language)
-
-            # ✅ The correct config for the WAV format
+            # Configure recognition - exactly the same as the test code
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                 sample_rate_hertz=config.sample_rate,
-                language_code=language_code,
+                language_code="tr-TR",  # Hardcode tr-TR like test
                 audio_channel_count=1,
                 enable_separate_recognition_per_channel=False,
-                # ✅ Use the enhanced model
-                model="latest_long",
-                use_enhanced=True,
-                # ✅ Add punctuation
-                enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
-                # ✅ Turn off the profanity filter (for better recognition)
-                profanity_filter=False,
-                # ✅ Metadata for the audio analysis
-                metadata=speech.RecognitionMetadata(
-                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
-                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
-                    original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
-                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
-                )
             )
 
-            log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}")
+            log_debug(f"Recognition config: language=tr-TR, sample_rate={config.sample_rate}")
 
             # ✅ Create audio object with WAV data
             audio = RecognitionAudio(content=wav_audio)
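Two observations on the new request path. First, the early-exit threshold compares raw byte length: 8,000 bytes of 16-bit mono PCM is 0.5 s at 8 kHz, but only 0.25 s at the 16 kHz the surrounding comments assume (16,000 samples/s × 2 bytes = 32,000 bytes/s). Second, the trimmed-down RecognitionConfig mirrors what the standalone test code sends. A minimal sketch of the same synchronous recognize call using google-cloud-speech directly, outside the GoogleSTT class (the function name and defaults here are illustrative):

from google.cloud import speech

def recognize_wav(wav_bytes: bytes, sample_rate: int = 16000) -> str:
    """Send LINEAR16 WAV content to Google Cloud Speech and return the top transcript."""
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="tr-TR",
        audio_channel_count=1,
    )
    audio = speech.RecognitionAudio(content=wav_bytes)
    response = client.recognize(config=config, audio=audio)
    if not response.results or not response.results[0].alternatives:
        return ""
    return response.results[0].alternatives[0].transcript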
@@ -295,24 +248,12 @@ class GoogleSTT(STTInterface):
             if result.alternatives:
                 alternative = result.alternatives[0]
 
-                # Extract word timestamps if available
-                word_timestamps = None
-                if config.enable_word_timestamps and hasattr(alternative, 'words'):
-                    word_timestamps = [
-                        {
-                            "word": word_info.word,
-                            "start_time": word_info.start_time.total_seconds(),
-                            "end_time": word_info.end_time.total_seconds()
-                        }
-                        for word_info in alternative.words
-                    ]
-
                 transcription = TranscriptionResult(
                     text=alternative.transcript,
                     confidence=alternative.confidence,
                     timestamp=datetime.now().timestamp(),
-                    language=language_code,
-                    word_timestamps=word_timestamps
+                    language="tr-TR",
+                    word_timestamps=None
                 )
 
                 log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
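The word-timestamp extraction is dropped here and `word_timestamps` is pinned to None. For reference, if it is reinstated later, `alternative.words` is only populated when the request config also sets `enable_word_time_offsets=True`; a standalone sketch of the flag and the same extraction loop, not part of this commit:

from google.cloud import speech

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="tr-TR",
    enable_word_time_offsets=True,  # required for alternative.words to be filled
)

def word_timestamps(alternative: speech.SpeechRecognitionAlternative) -> list:
    # start_time/end_time come back as datetime.timedelta in google-cloud-speech 2.x
    return [
        {
            "word": w.word,
            "start_time": w.start_time.total_seconds(),
            "end_time": w.end_time.total_seconds(),
        }
        for w in alternative.words
    ]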
@@ -326,154 +267,46 @@ class GoogleSTT(STTInterface):
             import traceback
             log_error(f"Traceback: {traceback.format_exc()}")
             return None
-
-    def
-
+
+    def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
+        """Create WAV exactly like test code using wave module"""
+        try:
+            import tempfile
+            import os
+            import wave
+
+            # Create a temporary file
+            temp_wav = tempfile.mktemp(suffix='.wav')
+
             try:
-                #
-
+                # Create the wave file - the same way as in the test code
+                with wave.open(temp_wav, 'wb') as wav_file:
+                    wav_file.setnchannels(1)  # Mono
+                    wav_file.setsampwidth(2)  # 16-bit
+                    wav_file.setframerate(sample_rate)  # 16kHz
+                    wav_file.writeframes(audio_data)
 
-                #
-
-
-                log_info(f"🔍 First 10 PCM samples: {first_samples}")
-                log_info(f"🔍 Max amplitude in first 10: {max(abs(s) for s in first_samples)}")
+                # Read the file back
+                with open(temp_wav, 'rb') as f:
+                    wav_data = f.read()
 
-
-                wav_buffer = io.BytesIO()
+                log_info(f"🔧 WAV created using wave module: {len(wav_data)} bytes")
 
-                #
-
-
-
-                def write_uint32(value: int):
-                    wav_buffer.write(struct.pack('<I', value))
-
-                def write_uint16(value: int):
-                    wav_buffer.write(struct.pack('<H', value))
-
-                # RIFF header
-                write_string('RIFF')
-                write_uint32(36 + length)  # File size - 8
-                write_string('WAVE')
-
-                # fmt chunk
-                write_string('fmt ')
-                write_uint32(16)  # Subchunk1Size (PCM)
-                write_uint16(1)  # AudioFormat (PCM = 1)
-                write_uint16(1)  # NumChannels (mono)
-                write_uint32(sample_rate)  # SampleRate
-                write_uint32(sample_rate * 1 * 2)  # ByteRate
-                write_uint16(1 * 2)  # BlockAlign
-                write_uint16(16)  # BitsPerSample
-
-                # data chunk
-                write_string('data')
-                write_uint32(length)  # Subchunk2Size
-
-                # Audio data
-                wav_buffer.write(audio_data)
-
-                wav_data = wav_buffer.getvalue()
-                wav_buffer.close()
-
-                # ✅ Debug: check the WAV header
-                if len(wav_data) >= 44:
-                    header_bytes = wav_data[:44]
-                    log_info(f"🔍 WAV header (first 44 bytes): {header_bytes.hex()}")
-
-                    # Parse the header
-                    riff = header_bytes[0:4].decode('ascii')
-                    file_size = struct.unpack('<I', header_bytes[4:8])[0]
-                    wave = header_bytes[8:12].decode('ascii')
-                    fmt_chunk = header_bytes[12:16].decode('ascii')
-                    fmt_size = struct.unpack('<I', header_bytes[16:20])[0]
-                    audio_format = struct.unpack('<H', header_bytes[20:22])[0]
-                    channels = struct.unpack('<H', header_bytes[22:24])[0]
-                    sample_rate_check = struct.unpack('<I', header_bytes[24:28])[0]
-                    byte_rate = struct.unpack('<I', header_bytes[28:32])[0]
-                    block_align = struct.unpack('<H', header_bytes[32:34])[0]
-                    bits_per_sample = struct.unpack('<H', header_bytes[34:36])[0]
-                    data_chunk = header_bytes[36:40].decode('ascii')
-                    data_size = struct.unpack('<I', header_bytes[40:44])[0]
-
-                    log_info(f"🔍 WAV Header Analysis:")
-                    log_info(f" RIFF: {riff}")
-                    log_info(f" File Size: {file_size}")
-                    log_info(f" WAVE: {wave}")
-                    log_info(f" FMT Chunk: {fmt_chunk}")
-                    log_info(f" Audio Format: {audio_format} (should be 1)")
-                    log_info(f" Channels: {channels} (should be 1)")
-                    log_info(f" Sample Rate: {sample_rate_check} (should be {sample_rate})")
-                    log_info(f" Byte Rate: {byte_rate}")
-                    log_info(f" Block Align: {block_align}")
-                    log_info(f" Bits Per Sample: {bits_per_sample}")
-                    log_info(f" Data Chunk: {data_chunk}")
-                    log_info(f" Data Size: {data_size} (should be {length})")
-
-                    # ✅ Validation
-                    if riff != 'RIFF':
-                        log_error(f"❌ Invalid RIFF header: {riff}")
-                    if wave != 'WAVE':
-                        log_error(f"❌ Invalid WAVE header: {wave}")
-                    if audio_format != 1:
-                        log_error(f"❌ Invalid audio format: {audio_format}")
-                    if channels != 1:
-                        log_error(f"❌ Invalid channel count: {channels}")
-                    if sample_rate_check != sample_rate:
-                        log_error(f"❌ Invalid sample rate: {sample_rate_check}")
-                    if data_size != length:
-                        log_error(f"❌ Invalid data size: {data_size} vs {length}")
-
-                # ✅ Debug: save the WAV file temporarily (for testing)
-                import tempfile
-                import os
-
-                temp_file = tempfile.mktemp(suffix='.wav')
-                try:
-                    with open(temp_file, 'wb') as f:
-                        f.write(wav_data)
-
-                    # Check that the WAV file really is valid
-                    import wave
-                    with wave.open(temp_file, 'rb') as wav_file:
-                        wav_channels = wav_file.getnchannels()
-                        wav_sample_width = wav_file.getsampwidth()
-                        wav_sample_rate = wav_file.getframerate()
-                        wav_frames = wav_file.getnframes()
-
-                        log_info(f"🔍 WAV File Validation:")
-                        log_info(f" Channels: {wav_channels}")
-                        log_info(f" Sample Width: {wav_sample_width}")
-                        log_info(f" Sample Rate: {wav_sample_rate}")
-                        log_info(f" Frames: {wav_frames}")
-                        log_info(f" Duration: {wav_frames / wav_sample_rate:.2f}s")
-
-                        # Read the first few frames
-                        first_frames = wav_file.readframes(10)
-                        if first_frames:
-                            first_samples_wav = struct.unpack('<10h', first_frames[:20])
-                            log_info(f"🔍 First 10 samples from WAV: {first_samples_wav}")
-
-                    log_info(f"✅ WAV file created and validated: {temp_file}")
-
-                except Exception as e:
-                    log_error(f"❌ WAV validation failed: {e}")
-                finally:
-                    # Cleanup
-                    if os.path.exists(temp_file):
-                        os.unlink(temp_file)
-
-                log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")
+                # Debug: check the wave file
+                with wave.open(temp_wav, 'rb') as wav_file:
+                    log_info(f"🔧 Wave validation: {wav_file.getnchannels()}ch, {wav_file.getframerate()}Hz, {wav_file.getnframes()} frames")
 
                 return wav_data
 
-
-
-
-
-
-
+            finally:
+                # Cleanup
+                if os.path.exists(temp_wav):
+                    os.unlink(temp_wav)
+
+        except Exception as e:
+            log_error(f"❌ Wave module WAV creation failed: {e}")
+            # Fallback to manual method
+            return self._convert_to_wav_proper(audio_data, sample_rate)
 
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
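The new `_create_wav_like_test` round-trips through a temporary file created with `tempfile.mktemp`, which the Python docs warn is race-prone (mkstemp or NamedTemporaryFile are the safer equivalents). The same `wave`-module framing also works entirely in memory, which would drop the temp file and the cleanup path; an in-memory sketch, not part of this commit:

import io
import wave

def pcm16_to_wav(audio_data: bytes, sample_rate: int) -> bytes:
    """Wrap raw 16-bit mono PCM in a WAV container without touching the filesystem."""
    buf = io.BytesIO()
    with wave.open(buf, 'wb') as wav_file:
        wav_file.setnchannels(1)        # mono
        wav_file.setsampwidth(2)        # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_data)
    return buf.getvalue()

The header the wave module writes is the same 44-byte RIFF/fmt/data layout that the removed struct-based builder assembled by hand.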