Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +146 -52
stt/stt_google.py
CHANGED
@@ -240,58 +240,152 @@ class GoogleSTT(STTInterface):
|
|
240 |
return None
|
241 |
|
242 |
def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
def get_supported_languages(self) -> List[str]:
|
297 |
"""Get list of supported language codes"""
|
|
|
240 |
return None
|
241 |
|
242 |
def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Wrap raw PCM bytes in a 44-byte RIFF/WAVE header and return the result.

    The header always declares uncompressed PCM (format 1), one channel and
    16 bits per sample, and ``audio_data`` is appended unmodified. The input
    is therefore assumed to already be 16-bit little-endian mono PCM --
    TODO confirm against callers.

    Along the way the method logs diagnostics: the first ten PCM samples,
    every parsed header field, and the parameters reported by the standard
    ``wave`` module after re-opening the result from a temporary file. A
    failure in that temporary-file validation step is logged and does not
    change the returned bytes.

    Args:
        audio_data: Raw PCM payload to wrap.
        sample_rate: Sampling rate in Hz written into the header.

    Returns:
        The WAV bytes (header + payload). If building the WAV raises, the
        error is logged and ``audio_data`` is returned unchanged as a
        fallback.
    """
    try:
        # Local imports, as only this method needs them. The wave module is
        # aliased so it cannot be confused with the parsed "WAVE" tag below.
        import os
        import tempfile
        import wave as wave_module

        length = len(audio_data)

        # Debug: peek at the first ten 16-bit samples of the input.
        if length >= 20:
            first_samples = struct.unpack('<10h', audio_data[:20])
            log_info(f"🔍 First 10 PCM samples: {first_samples}")
            log_info(f"🔍 Max amplitude in first 10: {max(abs(s) for s in first_samples)}")

        # Canonical 44-byte PCM header, packed in one call:
        # RIFF descriptor, "fmt " sub-chunk, then the "data" sub-chunk header.
        header_format = '<4sI4s4sIHHIIHH4sI'
        header = struct.pack(
            header_format,
            b'RIFF',
            36 + length,          # File size - 8
            b'WAVE',
            b'fmt ',
            16,                   # Subchunk1Size (PCM)
            1,                    # AudioFormat (PCM = 1)
            1,                    # NumChannels (mono)
            sample_rate,          # SampleRate
            sample_rate * 1 * 2,  # ByteRate = rate * channels * bytes/sample
            1 * 2,                # BlockAlign = channels * bytes/sample
            16,                   # BitsPerSample
            b'data',
            length,               # Subchunk2Size
        )
        wav_data = b''.join((header, audio_data))

        # Debug: dump the header and decode every field back out of it.
        header_bytes = wav_data[:44]
        log_info(f"🔍 WAV header (first 44 bytes): {header_bytes.hex()}")

        (
            riff_raw,
            file_size,
            wave_raw,
            fmt_raw,
            _fmt_size,
            audio_format,
            channels,
            sample_rate_check,
            byte_rate,
            block_align,
            bits_per_sample,
            data_raw,
            data_size,
        ) = struct.unpack(header_format, header_bytes)
        riff = riff_raw.decode('ascii')
        wave_tag = wave_raw.decode('ascii')
        fmt_chunk = fmt_raw.decode('ascii')
        data_chunk = data_raw.decode('ascii')

        log_info("🔍 WAV Header Analysis:")
        log_info(f" RIFF: {riff}")
        log_info(f" File Size: {file_size}")
        log_info(f" WAVE: {wave_tag}")
        log_info(f" FMT Chunk: {fmt_chunk}")
        log_info(f" Audio Format: {audio_format} (should be 1)")
        log_info(f" Channels: {channels} (should be 1)")
        log_info(f" Sample Rate: {sample_rate_check} (should be {sample_rate})")
        log_info(f" Byte Rate: {byte_rate}")
        log_info(f" Block Align: {block_align}")
        log_info(f" Bits Per Sample: {bits_per_sample}")
        log_info(f" Data Chunk: {data_chunk}")
        log_info(f" Data Size: {data_size} (should be {length})")

        # Validation of the decoded header fields (log only, never raises).
        if riff != 'RIFF':
            log_error(f"❌ Invalid RIFF header: {riff}")
        if wave_tag != 'WAVE':
            log_error(f"❌ Invalid WAVE header: {wave_tag}")
        if audio_format != 1:
            log_error(f"❌ Invalid audio format: {audio_format}")
        if channels != 1:
            log_error(f"❌ Invalid channel count: {channels}")
        if sample_rate_check != sample_rate:
            log_error(f"❌ Invalid sample rate: {sample_rate_check}")
        if data_size != length:
            log_error(f"❌ Invalid data size: {data_size} vs {length}")

        # Debug: round-trip the WAV through a temporary file and the wave
        # module to confirm an independent parser accepts it.
        temp_file = None
        try:
            # mkstemp creates the file atomically; mktemp only returns a
            # name, which another process could claim before we open it.
            fd, temp_file = tempfile.mkstemp(suffix='.wav')
            with os.fdopen(fd, 'wb') as f:
                f.write(wav_data)

            with wave_module.open(temp_file, 'rb') as wav_file:
                wav_channels = wav_file.getnchannels()
                wav_sample_width = wav_file.getsampwidth()
                wav_sample_rate = wav_file.getframerate()
                wav_frames = wav_file.getnframes()

                log_info("🔍 WAV File Validation:")
                log_info(f" Channels: {wav_channels}")
                log_info(f" Sample Width: {wav_sample_width}")
                log_info(f" Sample Rate: {wav_sample_rate}")
                log_info(f" Frames: {wav_frames}")
                log_info(f" Duration: {wav_frames / wav_sample_rate:.2f}s")

                # Read back the first few frames. Clips shorter than ten
                # samples return fewer bytes, so size the unpack format
                # from what was actually read.
                first_frames = wav_file.readframes(10)
                if first_frames:
                    sample_count = len(first_frames) // 2
                    first_samples_wav = struct.unpack(
                        f'<{sample_count}h', first_frames[:sample_count * 2]
                    )
                    log_info(f"🔍 First 10 samples from WAV: {first_samples_wav}")

            log_info(f"✅ WAV file created and validated: {temp_file}")

        except Exception as e:
            log_error(f"❌ WAV validation failed: {e}")
        finally:
            # Cleanup. A failure to delete the debug file must not escape
            # to the outer handler and discard the valid WAV.
            if temp_file is not None and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError as e:
                    log_error(f"❌ Failed to remove temp WAV file: {e}")

        log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")

        return wav_data

    except Exception as e:
        log_error(f"❌ WAV conversion failed: {e}")
        import traceback
        log_error(f"Traceback: {traceback.format_exc()}")
        # Fallback to raw PCM
        return audio_data
|
389 |
|
390 |
def get_supported_languages(self) -> List[str]:
|
391 |
"""Get list of supported language codes"""
|