ciyidogan committed on
Commit
c4954b5
·
verified ·
1 Parent(s): 5789d1c

Update stt/stt_google.py

Browse files
Files changed (1) hide show
  1. stt/stt_google.py +46 -213
stt/stt_google.py CHANGED
@@ -177,7 +177,7 @@ class GoogleSTT(STTInterface):
177
  log_error(f"❌ Silence trimming failed: {e}")
178
  return audio_data
179
 
180
- async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
181
  """Transcribe audio data using Google Cloud Speech API"""
182
  try:
183
  # Check if we have audio to transcribe
@@ -189,75 +189,28 @@ class GoogleSTT(STTInterface):
189
 
190
  # ✅ Audio analizi
191
  self._analyze_audio_content(audio_data)
192
-
193
  # ✅ Silence trimming ekle
194
  trimmed_audio = self._trim_silence(audio_data)
195
 
196
- # WAV formatında gönder - Google bu formatı daha iyi tanıyor
197
- wav_audio = self._convert_to_wav_proper(trimmed_audio, config.sample_rate)
198
- log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
199
-
200
- import tempfile
201
- import os
202
-
203
- # Raw PCM kaydet
204
- pcm_file = tempfile.mktemp(suffix='.pcm')
205
- with open(pcm_file, 'wb') as f:
206
- f.write(trimmed_audio)
207
- log_info(f"🔍 Raw PCM saved to: {pcm_file}")
208
-
209
- # WAV kaydet
210
- wav_file = tempfile.mktemp(suffix='.wav')
211
- with open(wav_file, 'wb') as f:
212
- f.write(wav_audio)
213
- log_info(f"🔍 WAV saved to: {wav_file}")
214
-
215
- # Test kodunla aynı şekilde test et
216
- try:
217
- import subprocess
218
- result = subprocess.run([
219
- 'python', 'app.py', wav_file
220
- ], capture_output=True, text=True, timeout=30)
221
- log_info(f"🔍 Test script result: {result.stdout}")
222
- if result.stderr:
223
- log_error(f"🔍 Test script error: {result.stderr}")
224
- except Exception as e:
225
- log_warning(f"Could not run test script: {e}")
226
 
227
- # Cleanup files after test
228
- try:
229
- os.unlink(pcm_file)
230
- os.unlink(wav_file)
231
- except:
232
- pass
233
 
234
- # Configure recognition
235
- language_code = self._map_language_code(config.language)
236
-
237
- # ✅ WAV formatı için doğru config
238
  recognition_config = RecognitionConfig(
239
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
240
  sample_rate_hertz=config.sample_rate,
241
- language_code=language_code,
242
  audio_channel_count=1,
243
  enable_separate_recognition_per_channel=False,
244
- # ✅ Enhanced model kullan
245
- model="latest_long",
246
- use_enhanced=True,
247
- # ✅ Punctuation ekle
248
- enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
249
- # ✅ Profanity filter'ı kapat (daha iyi tanıma için)
250
- profanity_filter=False,
251
- # ✅ Audio analizi için metadata
252
- metadata=speech.RecognitionMetadata(
253
- interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
254
- microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
255
- original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
256
- recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
257
- )
258
  )
259
 
260
- log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=latest_long")
261
 
262
  # ✅ Create audio object with WAV data
263
  audio = RecognitionAudio(content=wav_audio)
@@ -295,24 +248,12 @@ class GoogleSTT(STTInterface):
295
  if result.alternatives:
296
  alternative = result.alternatives[0]
297
 
298
- # Extract word timestamps if available
299
- word_timestamps = None
300
- if config.enable_word_timestamps and hasattr(alternative, 'words'):
301
- word_timestamps = [
302
- {
303
- "word": word_info.word,
304
- "start_time": word_info.start_time.total_seconds(),
305
- "end_time": word_info.end_time.total_seconds()
306
- }
307
- for word_info in alternative.words
308
- ]
309
-
310
  transcription = TranscriptionResult(
311
  text=alternative.transcript,
312
  confidence=alternative.confidence,
313
  timestamp=datetime.now().timestamp(),
314
- language=language_code,
315
- word_timestamps=word_timestamps
316
  )
317
 
318
  log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
@@ -326,154 +267,46 @@ class GoogleSTT(STTInterface):
326
  import traceback
327
  log_error(f"Traceback: {traceback.format_exc()}")
328
  return None
329
-
330
- def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
331
- """Convert raw PCM to proper WAV format - EXACTLY like test code"""
 
 
 
 
 
 
 
 
332
  try:
333
- # Test kodundan aynı WAV header oluşturma
334
- length = len(audio_data)
 
 
 
 
335
 
336
- # Debug: İlk birkaç byte'ı kontrol et
337
- if length >= 20:
338
- first_samples = struct.unpack('<10h', audio_data[:20])
339
- log_info(f"🔍 First 10 PCM samples: {first_samples}")
340
- log_info(f"🔍 Max amplitude in first 10: {max(abs(s) for s in first_samples)}")
341
 
342
- # BytesIO kullanarak memory'de WAV oluştur
343
- wav_buffer = io.BytesIO()
344
 
345
- # Test kodundan aynı header yazma
346
- def write_string(data: str):
347
- wav_buffer.write(data.encode('ascii'))
348
-
349
- def write_uint32(value: int):
350
- wav_buffer.write(struct.pack('<I', value))
351
-
352
- def write_uint16(value: int):
353
- wav_buffer.write(struct.pack('<H', value))
354
-
355
- # RIFF header
356
- write_string('RIFF')
357
- write_uint32(36 + length) # File size - 8
358
- write_string('WAVE')
359
-
360
- # fmt chunk
361
- write_string('fmt ')
362
- write_uint32(16) # Subchunk1Size (PCM)
363
- write_uint16(1) # AudioFormat (PCM = 1)
364
- write_uint16(1) # NumChannels (mono)
365
- write_uint32(sample_rate) # SampleRate
366
- write_uint32(sample_rate * 1 * 2) # ByteRate
367
- write_uint16(1 * 2) # BlockAlign
368
- write_uint16(16) # BitsPerSample
369
-
370
- # data chunk
371
- write_string('data')
372
- write_uint32(length) # Subchunk2Size
373
-
374
- # Audio data
375
- wav_buffer.write(audio_data)
376
-
377
- wav_data = wav_buffer.getvalue()
378
- wav_buffer.close()
379
-
380
- # ✅ Debug: WAV header'ını kontrol et
381
- if len(wav_data) >= 44:
382
- header_bytes = wav_data[:44]
383
- log_info(f"🔍 WAV header (first 44 bytes): {header_bytes.hex()}")
384
-
385
- # Header parse et
386
- riff = header_bytes[0:4].decode('ascii')
387
- file_size = struct.unpack('<I', header_bytes[4:8])[0]
388
- wave = header_bytes[8:12].decode('ascii')
389
- fmt_chunk = header_bytes[12:16].decode('ascii')
390
- fmt_size = struct.unpack('<I', header_bytes[16:20])[0]
391
- audio_format = struct.unpack('<H', header_bytes[20:22])[0]
392
- channels = struct.unpack('<H', header_bytes[22:24])[0]
393
- sample_rate_check = struct.unpack('<I', header_bytes[24:28])[0]
394
- byte_rate = struct.unpack('<I', header_bytes[28:32])[0]
395
- block_align = struct.unpack('<H', header_bytes[32:34])[0]
396
- bits_per_sample = struct.unpack('<H', header_bytes[34:36])[0]
397
- data_chunk = header_bytes[36:40].decode('ascii')
398
- data_size = struct.unpack('<I', header_bytes[40:44])[0]
399
-
400
- log_info(f"🔍 WAV Header Analysis:")
401
- log_info(f" RIFF: {riff}")
402
- log_info(f" File Size: {file_size}")
403
- log_info(f" WAVE: {wave}")
404
- log_info(f" FMT Chunk: {fmt_chunk}")
405
- log_info(f" Audio Format: {audio_format} (should be 1)")
406
- log_info(f" Channels: {channels} (should be 1)")
407
- log_info(f" Sample Rate: {sample_rate_check} (should be {sample_rate})")
408
- log_info(f" Byte Rate: {byte_rate}")
409
- log_info(f" Block Align: {block_align}")
410
- log_info(f" Bits Per Sample: {bits_per_sample}")
411
- log_info(f" Data Chunk: {data_chunk}")
412
- log_info(f" Data Size: {data_size} (should be {length})")
413
-
414
- # ✅ Validation
415
- if riff != 'RIFF':
416
- log_error(f"❌ Invalid RIFF header: {riff}")
417
- if wave != 'WAVE':
418
- log_error(f"❌ Invalid WAVE header: {wave}")
419
- if audio_format != 1:
420
- log_error(f"❌ Invalid audio format: {audio_format}")
421
- if channels != 1:
422
- log_error(f"❌ Invalid channel count: {channels}")
423
- if sample_rate_check != sample_rate:
424
- log_error(f"❌ Invalid sample rate: {sample_rate_check}")
425
- if data_size != length:
426
- log_error(f"❌ Invalid data size: {data_size} vs {length}")
427
-
428
- # ✅ Debug: WAV dosyasını geçici olarak kaydet (test için)
429
- import tempfile
430
- import os
431
-
432
- temp_file = tempfile.mktemp(suffix='.wav')
433
- try:
434
- with open(temp_file, 'wb') as f:
435
- f.write(wav_data)
436
-
437
- # WAV dosyasının gerçekten valid olduğunu kontrol et
438
- import wave
439
- with wave.open(temp_file, 'rb') as wav_file:
440
- wav_channels = wav_file.getnchannels()
441
- wav_sample_width = wav_file.getsampwidth()
442
- wav_sample_rate = wav_file.getframerate()
443
- wav_frames = wav_file.getnframes()
444
-
445
- log_info(f"🔍 WAV File Validation:")
446
- log_info(f" Channels: {wav_channels}")
447
- log_info(f" Sample Width: {wav_sample_width}")
448
- log_info(f" Sample Rate: {wav_sample_rate}")
449
- log_info(f" Frames: {wav_frames}")
450
- log_info(f" Duration: {wav_frames / wav_sample_rate:.2f}s")
451
-
452
- # İlk birkaç frame'i oku
453
- first_frames = wav_file.readframes(10)
454
- if first_frames:
455
- first_samples_wav = struct.unpack('<10h', first_frames[:20])
456
- log_info(f"🔍 First 10 samples from WAV: {first_samples_wav}")
457
-
458
- log_info(f"✅ WAV file created and validated: {temp_file}")
459
-
460
- except Exception as e:
461
- log_error(f"❌ WAV validation failed: {e}")
462
- finally:
463
- # Cleanup
464
- if os.path.exists(temp_file):
465
- os.unlink(temp_file)
466
-
467
- log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")
468
 
469
  return wav_data
470
 
471
- except Exception as e:
472
- log_error(f"❌ WAV conversion failed: {e}")
473
- import traceback
474
- log_error(f"Traceback: {traceback.format_exc()}")
475
- # Fallback to raw PCM
476
- return audio_data
 
 
 
477
 
478
  def get_supported_languages(self) -> List[str]:
479
  """Get list of supported language codes"""
 
177
  log_error(f"❌ Silence trimming failed: {e}")
178
  return audio_data
179
 
180
+ async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
181
  """Transcribe audio data using Google Cloud Speech API"""
182
  try:
183
  # Check if we have audio to transcribe
 
189
 
190
  # ✅ Audio analizi
191
  self._analyze_audio_content(audio_data)
192
+
193
  # ✅ Silence trimming ekle
194
  trimmed_audio = self._trim_silence(audio_data)
195
 
196
+ if len(trimmed_audio) < 8000: # 0.5 saniyeden az
197
+ log_warning("⚠️ Audio too short after trimming")
198
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ # Test kodundan EXACT aynı format - wave modülü kullan
201
+ wav_audio = self._create_wav_like_test(trimmed_audio, config.sample_rate)
202
+ log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
 
 
 
203
 
204
+ # Configure recognition - TEST KODUNDAN EXACT AYNI
 
 
 
205
  recognition_config = RecognitionConfig(
206
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
207
  sample_rate_hertz=config.sample_rate,
208
+ language_code="tr-TR", # Hardcode tr-TR like test
209
  audio_channel_count=1,
210
  enable_separate_recognition_per_channel=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  )
212
 
213
+ log_debug(f"Recognition config: language=tr-TR, sample_rate={config.sample_rate}")
214
 
215
  # ✅ Create audio object with WAV data
216
  audio = RecognitionAudio(content=wav_audio)
 
248
  if result.alternatives:
249
  alternative = result.alternatives[0]
250
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  transcription = TranscriptionResult(
252
  text=alternative.transcript,
253
  confidence=alternative.confidence,
254
  timestamp=datetime.now().timestamp(),
255
+ language="tr-TR",
256
+ word_timestamps=None
257
  )
258
 
259
  log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
 
267
  import traceback
268
  log_error(f"Traceback: {traceback.format_exc()}")
269
  return None
270
+
271
def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Wrap raw PCM audio in a WAV container using the ``wave`` module.

    The container is written as mono (1 channel) with 2-byte (16-bit)
    samples at ``sample_rate`` Hz. It is built entirely in memory: the
    ``wave`` module accepts any seekable file-like object, so no temporary
    file (and no race-prone ``tempfile.mktemp`` name) is needed.

    Args:
        audio_data: Raw PCM bytes to place in the WAV ``data`` chunk.
        sample_rate: Sample rate in Hz written into the WAV header.

    Returns:
        The complete WAV file as bytes. If WAV creation fails for any
        reason, the error is logged and the unmodified ``audio_data`` is
        returned instead.
    """
    import io
    import wave

    try:
        buffer = io.BytesIO()

        # Write header + frames; closing the writer patches the sizes in
        # the header but leaves the caller-supplied buffer open.
        with wave.open(buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)            # mono
            wav_file.setsampwidth(2)            # 2 bytes per sample = 16-bit
            wav_file.setframerate(sample_rate)  # caller-supplied rate
            wav_file.writeframes(audio_data)

        wav_data = buffer.getvalue()
        log_info(f"🔧 WAV created using wave module: {len(wav_data)} bytes")

        # Debug: parse the bytes we just produced to confirm the header.
        with wave.open(io.BytesIO(wav_data), 'rb') as wav_file:
            log_info(f"🔧 Wave validation: {wav_file.getnchannels()}ch, {wav_file.getframerate()}Hz, {wav_file.getnframes()} frames")

        return wav_data

    except Exception as e:
        log_error(f"❌ Wave module WAV creation failed: {e}")
        # The previous fallback called self._convert_to_wav_proper, which
        # this change removes from the class, so it would raise
        # AttributeError. Fall back to the raw PCM bytes instead.
        # NOTE(review): the caller requests LINEAR16 encoding, so headerless
        # PCM is presumably acceptable to the API - confirm.
        return audio_data
310
 
311
  def get_supported_languages(self) -> List[str]:
312
  """Get list of supported language codes"""