Update stt/stt_google.py
stt/stt_google.py  CHANGED  (+46 -213)
@@ -177,7 +177,7 @@ class GoogleSTT(STTInterface):
             log_error(f"❌ Silence trimming failed: {e}")
             return audio_data
 
-
+    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
         """Transcribe audio data using Google Cloud Speech API"""
         try:
             # Check if we have audio to transcribe
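Note: `transcribe` is declared `async`, so callers have to await it from an event loop. A minimal usage sketch, not part of this commit (the import path and the `GoogleSTT`/`STTConfig` constructor arguments shown here are assumptions):

import asyncio

from stt.stt_google import GoogleSTT
from stt.stt_interface import STTConfig  # assumed location of STTConfig

async def main() -> None:
    stt = GoogleSTT()
    config = STTConfig(language="tr-TR", sample_rate=16000)  # assumed fields
    with open("utterance.pcm", "rb") as f:  # raw 16-bit mono PCM
        pcm = f.read()
    result = await stt.transcribe(pcm, config)
    if result is not None:
        print(result.text, result.confidence)

asyncio.run(main())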
@@ -189,75 +189,28 @@ class GoogleSTT(STTInterface):
 
             # ✅ Audio analysis
             self._analyze_audio_content(audio_data)
-
+
             # ✅ Add silence trimming
             trimmed_audio = self._trim_silence(audio_data)
 
-
-
-
-
-            import tempfile
-            import os
-
-            # Save the raw PCM
-            pcm_file = tempfile.mktemp(suffix='.pcm')
-            with open(pcm_file, 'wb') as f:
-                f.write(trimmed_audio)
-            log_info(f"🔍 Raw PCM saved to: {pcm_file}")
-
-            # Save the WAV
-            wav_file = tempfile.mktemp(suffix='.wav')
-            with open(wav_file, 'wb') as f:
-                f.write(wav_audio)
-            log_info(f"🔍 WAV saved to: {wav_file}")
-
-            # Run it the same way as the test code
-            try:
-                import subprocess
-                result = subprocess.run([
-                    'python', 'app.py', wav_file
-                ], capture_output=True, text=True, timeout=30)
-                log_info(f"🔍 Test script result: {result.stdout}")
-                if result.stderr:
-                    log_error(f"🔍 Test script error: {result.stderr}")
-            except Exception as e:
-                log_warning(f"Could not run test script: {e}")
+            if len(trimmed_audio) < 8000:  # less than 0.5 seconds
+                log_warning("⚠️ Audio too short after trimming")
+                return None
 
-            #
-
-
-                os.unlink(wav_file)
-            except:
-                pass
+            # ✅ Exactly the same format as the test code - use the wave module
+            wav_audio = self._create_wav_like_test(trimmed_audio, config.sample_rate)
+            log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
 
-            # Configure recognition
-            language_code = self._map_language_code(config.language)
-
-            # ✅ The correct config for the WAV format
+            # Configure recognition - exactly the same as the test code
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                 sample_rate_hertz=config.sample_rate,
-                language_code=language_code,
+                language_code="tr-TR",  # Hardcode tr-TR like test
                 audio_channel_count=1,
                 enable_separate_recognition_per_channel=False,
-                # ✅ Use the enhanced model
-                model="latest_long",
-                use_enhanced=True,
-                # ✅ Add punctuation
-                enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
-                # ✅ Turn off the profanity filter (for better recognition)
-                profanity_filter=False,
-                # ✅ Metadata for the audio analysis
-                metadata=speech.RecognitionMetadata(
-                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
-                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
-                    original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
-                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
-                )
             )
 
-            log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}")
+            log_debug(f"Recognition config: language=tr-TR, sample_rate={config.sample_rate}")
 
             # ✅ Create audio object with WAV data
             audio = RecognitionAudio(content=wav_audio)
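Two observations on the new request path. First, the early-exit threshold compares raw byte length: 8,000 bytes of 16-bit mono PCM is 0.5 s at 8 kHz, but only 0.25 s at the 16 kHz the surrounding comments assume (16,000 samples/s × 2 bytes = 32,000 bytes/s). Second, the trimmed-down RecognitionConfig mirrors what the standalone test code sends. A minimal sketch of the same synchronous recognize call using google-cloud-speech directly, outside the GoogleSTT class (the function name and defaults here are illustrative):

from google.cloud import speech

def recognize_wav(wav_bytes: bytes, sample_rate: int = 16000) -> str:
    """Send LINEAR16 WAV content to Google Cloud Speech and return the top transcript."""
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="tr-TR",
        audio_channel_count=1,
    )
    audio = speech.RecognitionAudio(content=wav_bytes)
    response = client.recognize(config=config, audio=audio)
    if not response.results or not response.results[0].alternatives:
        return ""
    return response.results[0].alternatives[0].transcript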
@@ -295,24 +248,12 @@ class GoogleSTT(STTInterface):
             if result.alternatives:
                 alternative = result.alternatives[0]
 
-                # Extract word timestamps if available
-                word_timestamps = None
-                if config.enable_word_timestamps and hasattr(alternative, 'words'):
-                    word_timestamps = [
-                        {
-                            "word": word_info.word,
-                            "start_time": word_info.start_time.total_seconds(),
-                            "end_time": word_info.end_time.total_seconds()
-                        }
-                        for word_info in alternative.words
-                    ]
-
                 transcription = TranscriptionResult(
                     text=alternative.transcript,
                     confidence=alternative.confidence,
                     timestamp=datetime.now().timestamp(),
-                    language=language_code,
-                    word_timestamps=word_timestamps
+                    language="tr-TR",
+                    word_timestamps=None
                 )
 
                 log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
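The word-timestamp extraction is dropped here and `word_timestamps` is pinned to None. For reference, if it is reinstated later, `alternative.words` is only populated when the request config also sets `enable_word_time_offsets=True`; a standalone sketch of the flag and the same extraction loop, not part of this commit:

from google.cloud import speech

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="tr-TR",
    enable_word_time_offsets=True,  # required for alternative.words to be filled
)

def word_timestamps(alternative: speech.SpeechRecognitionAlternative) -> list:
    # start_time/end_time come back as datetime.timedelta in google-cloud-speech 2.x
    return [
        {
            "word": w.word,
            "start_time": w.start_time.total_seconds(),
            "end_time": w.end_time.total_seconds(),
        }
        for w in alternative.words
    ]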
@@ -326,154 +267,46 @@ class GoogleSTT(STTInterface):
             import traceback
             log_error(f"Traceback: {traceback.format_exc()}")
             return None
-
-    def
-
+
+    def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
+        """Create WAV exactly like test code using wave module"""
+        try:
+            import tempfile
+            import os
+            import wave
+
+            # Create a temporary file
+            temp_wav = tempfile.mktemp(suffix='.wav')
+
             try:
-                #
-
+                # Create the wave file - the same way as in the test code
+                with wave.open(temp_wav, 'wb') as wav_file:
+                    wav_file.setnchannels(1)  # Mono
+                    wav_file.setsampwidth(2)  # 16-bit
+                    wav_file.setframerate(sample_rate)  # 16kHz
+                    wav_file.writeframes(audio_data)
 
-                #
-
-
-                log_info(f"🔍 First 10 PCM samples: {first_samples}")
-                log_info(f"🔍 Max amplitude in first 10: {max(abs(s) for s in first_samples)}")
+                # Read the file back
+                with open(temp_wav, 'rb') as f:
+                    wav_data = f.read()
 
-
-                wav_buffer = io.BytesIO()
+                log_info(f"🔧 WAV created using wave module: {len(wav_data)} bytes")
 
-                #
-
-
-
-                def write_uint32(value: int):
-                    wav_buffer.write(struct.pack('<I', value))
-
-                def write_uint16(value: int):
-                    wav_buffer.write(struct.pack('<H', value))
-
-                # RIFF header
-                write_string('RIFF')
-                write_uint32(36 + length)  # File size - 8
-                write_string('WAVE')
-
-                # fmt chunk
-                write_string('fmt ')
-                write_uint32(16)  # Subchunk1Size (PCM)
-                write_uint16(1)  # AudioFormat (PCM = 1)
-                write_uint16(1)  # NumChannels (mono)
-                write_uint32(sample_rate)  # SampleRate
-                write_uint32(sample_rate * 1 * 2)  # ByteRate
-                write_uint16(1 * 2)  # BlockAlign
-                write_uint16(16)  # BitsPerSample
-
-                # data chunk
-                write_string('data')
-                write_uint32(length)  # Subchunk2Size
-
-                # Audio data
-                wav_buffer.write(audio_data)
-
-                wav_data = wav_buffer.getvalue()
-                wav_buffer.close()
-
-                # ✅ Debug: check the WAV header
-                if len(wav_data) >= 44:
-                    header_bytes = wav_data[:44]
-                    log_info(f"🔍 WAV header (first 44 bytes): {header_bytes.hex()}")
-
-                    # Parse the header
-                    riff = header_bytes[0:4].decode('ascii')
-                    file_size = struct.unpack('<I', header_bytes[4:8])[0]
-                    wave = header_bytes[8:12].decode('ascii')
-                    fmt_chunk = header_bytes[12:16].decode('ascii')
-                    fmt_size = struct.unpack('<I', header_bytes[16:20])[0]
-                    audio_format = struct.unpack('<H', header_bytes[20:22])[0]
-                    channels = struct.unpack('<H', header_bytes[22:24])[0]
-                    sample_rate_check = struct.unpack('<I', header_bytes[24:28])[0]
-                    byte_rate = struct.unpack('<I', header_bytes[28:32])[0]
-                    block_align = struct.unpack('<H', header_bytes[32:34])[0]
-                    bits_per_sample = struct.unpack('<H', header_bytes[34:36])[0]
-                    data_chunk = header_bytes[36:40].decode('ascii')
-                    data_size = struct.unpack('<I', header_bytes[40:44])[0]
-
-                    log_info(f"🔍 WAV Header Analysis:")
-                    log_info(f" RIFF: {riff}")
-                    log_info(f" File Size: {file_size}")
-                    log_info(f" WAVE: {wave}")
-                    log_info(f" FMT Chunk: {fmt_chunk}")
-                    log_info(f" Audio Format: {audio_format} (should be 1)")
-                    log_info(f" Channels: {channels} (should be 1)")
-                    log_info(f" Sample Rate: {sample_rate_check} (should be {sample_rate})")
-                    log_info(f" Byte Rate: {byte_rate}")
-                    log_info(f" Block Align: {block_align}")
-                    log_info(f" Bits Per Sample: {bits_per_sample}")
-                    log_info(f" Data Chunk: {data_chunk}")
-                    log_info(f" Data Size: {data_size} (should be {length})")
-
-                    # ✅ Validation
-                    if riff != 'RIFF':
-                        log_error(f"❌ Invalid RIFF header: {riff}")
-                    if wave != 'WAVE':
-                        log_error(f"❌ Invalid WAVE header: {wave}")
-                    if audio_format != 1:
-                        log_error(f"❌ Invalid audio format: {audio_format}")
-                    if channels != 1:
-                        log_error(f"❌ Invalid channel count: {channels}")
-                    if sample_rate_check != sample_rate:
-                        log_error(f"❌ Invalid sample rate: {sample_rate_check}")
-                    if data_size != length:
-                        log_error(f"❌ Invalid data size: {data_size} vs {length}")
-
-                # ✅ Debug: save the WAV file temporarily (for testing)
-                import tempfile
-                import os
-
-                temp_file = tempfile.mktemp(suffix='.wav')
-                try:
-                    with open(temp_file, 'wb') as f:
-                        f.write(wav_data)
-
-                    # Check that the WAV file really is valid
-                    import wave
-                    with wave.open(temp_file, 'rb') as wav_file:
-                        wav_channels = wav_file.getnchannels()
-                        wav_sample_width = wav_file.getsampwidth()
-                        wav_sample_rate = wav_file.getframerate()
-                        wav_frames = wav_file.getnframes()
-
-                        log_info(f"🔍 WAV File Validation:")
-                        log_info(f" Channels: {wav_channels}")
-                        log_info(f" Sample Width: {wav_sample_width}")
-                        log_info(f" Sample Rate: {wav_sample_rate}")
-                        log_info(f" Frames: {wav_frames}")
-                        log_info(f" Duration: {wav_frames / wav_sample_rate:.2f}s")
-
-                        # Read the first few frames
-                        first_frames = wav_file.readframes(10)
-                        if first_frames:
-                            first_samples_wav = struct.unpack('<10h', first_frames[:20])
-                            log_info(f"🔍 First 10 samples from WAV: {first_samples_wav}")
-
-                    log_info(f"✅ WAV file created and validated: {temp_file}")
-
-                except Exception as e:
-                    log_error(f"❌ WAV validation failed: {e}")
-                finally:
-                    # Cleanup
-                    if os.path.exists(temp_file):
-                        os.unlink(temp_file)
-
-                log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")
+                # Debug: check the wave file
+                with wave.open(temp_wav, 'rb') as wav_file:
+                    log_info(f"🔧 Wave validation: {wav_file.getnchannels()}ch, {wav_file.getframerate()}Hz, {wav_file.getnframes()} frames")
 
                 return wav_data
 
-
-
-
-
-
-
+            finally:
+                # Cleanup
+                if os.path.exists(temp_wav):
+                    os.unlink(temp_wav)
+
+        except Exception as e:
+            log_error(f"❌ Wave module WAV creation failed: {e}")
+            # Fallback to manual method
+            return self._convert_to_wav_proper(audio_data, sample_rate)
 
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
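The new `_create_wav_like_test` round-trips through a temporary file created with `tempfile.mktemp`, which the Python docs warn is race-prone (mkstemp or NamedTemporaryFile are the safer equivalents). The same `wave`-module framing also works entirely in memory, which would drop the temp file and the cleanup path; an in-memory sketch, not part of this commit:

import io
import wave

def pcm16_to_wav(audio_data: bytes, sample_rate: int) -> bytes:
    """Wrap raw 16-bit mono PCM in a WAV container without touching the filesystem."""
    buf = io.BytesIO()
    with wave.open(buf, 'wb') as wav_file:
        wav_file.setnchannels(1)        # mono
        wav_file.setsampwidth(2)        # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_data)
    return buf.getvalue()

The header the wave module writes is the same 44-byte RIFF/fmt/data layout that the removed struct-based builder assembled by hand.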