Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +111 -145
stt/stt_google.py
CHANGED
@@ -79,29 +79,66 @@ class GoogleCloudSTT(STTInterface):
|
|
79 |
}
|
80 |
return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
|
81 |
|
82 |
-
async def stream_audio(self, audio_chunk: bytes) ->
|
83 |
-
"""Stream audio chunk and get
|
84 |
if not self.is_streaming:
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
try:
|
89 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
self.audio_queue.put(audio_chunk)
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
try:
|
95 |
-
# Non-blocking get from queue
|
96 |
result = self.responses_queue.get_nowait()
|
|
|
97 |
yield result
|
98 |
except queue.Empty:
|
99 |
-
# No
|
|
|
|
|
|
|
100 |
break
|
101 |
-
|
102 |
except Exception as e:
|
103 |
-
log_error(f"β
|
104 |
-
self.is_streaming = False
|
105 |
raise
|
106 |
|
107 |
async def stop_streaming(self) -> Optional[TranscriptionResult]:
|
@@ -343,142 +380,71 @@ class GoogleCloudSTT(STTInterface):
|
|
343 |
log_error(f"β Error queuing result: {e}")
|
344 |
|
345 |
def _run_stream(self):
|
346 |
-
"""Run the streaming recognition in a separate thread"""
|
347 |
try:
|
348 |
-
log_info(
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
total_bytes = 0
|
354 |
-
|
355 |
-
while not self.stop_event.is_set():
|
356 |
-
try:
|
357 |
-
chunk = self.audio_queue.get(timeout=0.1)
|
358 |
-
if chunk is None:
|
359 |
-
log_info("π Poison pill received, stopping request generator")
|
360 |
-
break
|
361 |
-
|
362 |
-
chunk_count += 1
|
363 |
-
total_bytes += len(chunk)
|
364 |
-
|
365 |
-
# İlk chunk'ta audio format kontrolü
|
366 |
-
if chunk_count == 1:
|
367 |
-
log_info(f"π€ First chunk - size: {len(chunk)} bytes")
|
368 |
-
if len(chunk) >= 4 and chunk[:4] == b'\x1a\x45\xdf\xa3':
|
369 |
-
log_info("✅ Valid WEBM header detected")
|
370 |
-
|
371 |
-
# Her 50 chunk'ta durum raporu
|
372 |
-
if chunk_count % 50 == 0:
|
373 |
-
log_info(f"π€ Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB total")
|
374 |
-
|
375 |
-
yield speech.StreamingRecognizeRequest(audio_content=chunk)
|
376 |
-
|
377 |
-
except queue.Empty:
|
378 |
-
continue
|
379 |
-
except Exception as e:
|
380 |
-
log_error(f"β Error in request generator: {e}")
|
381 |
-
break
|
382 |
-
|
383 |
-
log_info(f"π Request generator finished. Total chunks: {chunk_count}, Total bytes: {total_bytes}")
|
384 |
-
|
385 |
# Create streaming client
|
386 |
-
requests = request_generator()
|
387 |
log_info("π€ Creating Google STT streaming client...")
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
|
409 |
-
#
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
continue
|
426 |
-
|
427 |
-
for result in response.results:
|
428 |
-
result_count += 1
|
429 |
-
|
430 |
-
if not result.alternatives:
|
431 |
-
continue
|
432 |
-
|
433 |
-
alternative = result.alternatives[0]
|
434 |
-
|
435 |
-
# Log all transcripts, even empty ones
|
436 |
-
log_debug(f"π Transcript: '{alternative.transcript}' (is_final: {result.is_final})")
|
437 |
-
|
438 |
-
if alternative.transcript.strip():
|
439 |
-
# Create transcription result
|
440 |
-
transcription = TranscriptionResult(
|
441 |
-
text=alternative.transcript,
|
442 |
-
is_final=result.is_final,
|
443 |
-
confidence=getattr(alternative, 'confidence', 0.0),
|
444 |
-
timestamp=datetime.now().timestamp()
|
445 |
-
)
|
446 |
-
|
447 |
-
# Put result in queue
|
448 |
-
self._put_result(transcription)
|
449 |
-
|
450 |
-
if result.is_final:
|
451 |
-
log_info(f"π― FINAL TRANSCRIPT: '{alternative.transcript}'")
|
452 |
-
|
453 |
-
# Single utterance modunda Google STT otomatik kapanır
|
454 |
-
if self.streaming_config.single_utterance:
|
455 |
-
log_info("✅ Single utterance completed - Stream ending")
|
456 |
-
return
|
457 |
-
else:
|
458 |
-
log_debug(f"π Interim: '{alternative.transcript}'")
|
459 |
-
|
460 |
-
except StopIteration:
|
461 |
-
log_info("✅ Google STT stream ended (StopIteration)")
|
462 |
-
except Exception as e:
|
463 |
-
log_error(f"β Error processing responses: {e}")
|
464 |
-
|
465 |
-
log_info(f"π Google STT stream ended. Responses: {response_count}, Results: {result_count}")
|
466 |
-
|
467 |
-
except Exception as e:
|
468 |
-
error_msg = str(e)
|
469 |
|
470 |
-
# Beklenen hatalar
|
471 |
-
if "iterating requests" in error_msg:
|
472 |
-
log_info("✅ Stream ended normally")
|
473 |
-
elif "Exceeded maximum allowed stream duration" in error_msg:
|
474 |
-
log_warning("⚠️ Stream duration limit (5 min)")
|
475 |
-
elif "InvalidArgument" in error_msg:
|
476 |
-
log_error(f"β Invalid STT configuration: {error_msg}")
|
477 |
-
else:
|
478 |
-
log_error(f"β Google STT error: {error_msg}")
|
479 |
-
|
480 |
except Exception as e:
|
481 |
-
log_error(f"β
|
|
|
|
|
|
|
482 |
finally:
|
483 |
log_info("π€ Google STT stream thread ended")
|
484 |
-
self.
|
|
|
|
79 |
}
|
80 |
return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
|
81 |
|
82 |
+
async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
|
83 |
+
"""Stream audio chunk and get results"""
|
84 |
if not self.is_streaming:
|
85 |
+
raise Exception("Streaming not started")
|
86 |
+
|
|
|
87 |
try:
|
88 |
+
# Audio validation and logging
|
89 |
+
chunk_size = len(audio_chunk)
|
90 |
+
|
91 |
+
# Log first chunk details
|
92 |
+
if self.chunk_count == 0:
|
93 |
+
log_info(f"π€ First chunk - size: {chunk_size} bytes")
|
94 |
+
# Check for WEBM header
|
95 |
+
if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
|
96 |
+
log_info("✅ Valid WEBM header detected")
|
97 |
+
else:
|
98 |
+
hex_preview = audio_chunk[:20].hex()
|
99 |
+
log_warning(f"⚠️ Unexpected audio format. First 20 bytes: {hex_preview}")
|
100 |
+
|
101 |
+
# Try to measure audio level (if it's raw PCM)
|
102 |
+
try:
|
103 |
+
import numpy as np
|
104 |
+
# This might fail for WEBM, but let's try
|
105 |
+
audio_array = np.frombuffer(audio_chunk[-1000:], dtype=np.int16) # Last 1000 bytes
|
106 |
+
if len(audio_array) > 0:
|
107 |
+
rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
|
108 |
+
db = 20 * np.log10(max(rms, 1) / 32768.0)
|
109 |
+
if self.chunk_count % 50 == 0:
|
110 |
+
log_info(f"π Audio level estimate: {db:.1f} dB")
|
111 |
+
except:
|
112 |
+
# Expected for WEBM format
|
113 |
+
pass
|
114 |
+
|
115 |
+
# Put chunk in queue
|
116 |
self.audio_queue.put(audio_chunk)
|
117 |
+
self.chunk_count += 1
|
118 |
+
self.total_bytes += chunk_size
|
119 |
+
|
120 |
+
# Log progress
|
121 |
+
if self.chunk_count % 50 == 0:
|
122 |
+
log_info(f"π€ Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")
|
123 |
+
|
124 |
+
# Check for responses with timeout
|
125 |
+
timeout = 0.1 # 100ms timeout for checking responses
|
126 |
+
end_time = time.time() + timeout
|
127 |
+
|
128 |
+
while time.time() < end_time:
|
129 |
try:
|
|
|
130 |
result = self.responses_queue.get_nowait()
|
131 |
+
log_info(f"π― Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
|
132 |
yield result
|
133 |
except queue.Empty:
|
134 |
+
# No results yet, continue
|
135 |
+
await asyncio.sleep(0.01)
|
136 |
+
except Exception as e:
|
137 |
+
log_error(f"Error getting result from queue: {e}")
|
138 |
break
|
139 |
+
|
140 |
except Exception as e:
|
141 |
+
log_error(f"β Error in stream_audio: {e}")
|
|
|
142 |
raise
|
143 |
|
144 |
async def stop_streaming(self) -> Optional[TranscriptionResult]:
|
|
|
380 |
log_error(f"β Error queuing result: {e}")
|
381 |
|
382 |
def _run_stream(self):
|
383 |
+
"""Run the streaming recognition loop in a separate thread"""
|
384 |
try:
|
385 |
+
log_info("π€ Google STT stream thread started - Single utterance mode: {}".format(self.single_utterance))
|
386 |
+
|
387 |
+
# Create request generator
|
388 |
+
requests = self._request_generator()
|
389 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
# Create streaming client
|
|
|
391 |
log_info("π€ Creating Google STT streaming client...")
|
392 |
+
responses = self.client.streaming_recognize(self.streaming_config, requests)
|
393 |
+
|
394 |
+
# Track if we've received any response
|
395 |
+
first_response_time = None
|
396 |
+
response_count = 0
|
397 |
+
|
398 |
+
# Process responses with detailed logging
|
399 |
+
for response in responses:
|
400 |
+
response_count += 1
|
401 |
+
|
402 |
+
if first_response_time is None:
|
403 |
+
first_response_time = time.time()
|
404 |
+
elapsed = first_response_time - self.stream_start_time
|
405 |
+
log_info(f"π FIRST RESPONSE from Google STT after {elapsed:.2f}s")
|
406 |
+
|
407 |
+
# Log every response, even if empty
|
408 |
+
log_info(f"π¨ Google STT Response #{response_count}: has_results={len(response.results) > 0}")
|
409 |
+
|
410 |
+
if not response.results:
|
411 |
+
log_info("π Empty response from Google STT (no results)")
|
412 |
+
continue
|
413 |
+
|
414 |
+
# Log all results in detail
|
415 |
+
for result_idx, result in enumerate(response.results):
|
416 |
+
log_info(f"π Result #{result_idx}: is_final={result.is_final}, "
|
417 |
+
f"alternatives={len(result.alternatives)}, "
|
418 |
+
f"stability={getattr(result, 'stability', 'N/A')}")
|
419 |
+
|
420 |
+
if result.alternatives:
|
421 |
+
best_alternative = result.alternatives[0]
|
422 |
+
log_info(f"π£οΈ Transcript: '{best_alternative.transcript}' "
|
423 |
+
f"(confidence: {best_alternative.confidence:.3f})")
|
424 |
|
425 |
+
# Put result in queue
|
426 |
+
result_obj = TranscriptionResult(
|
427 |
+
text=best_alternative.transcript,
|
428 |
+
is_final=result.is_final,
|
429 |
+
confidence=best_alternative.confidence,
|
430 |
+
timestamp=datetime.utcnow()
|
431 |
+
)
|
432 |
+
|
433 |
+
self.responses_queue.put(result_obj)
|
434 |
+
log_info(f"✅ Result queued: is_final={result.is_final}, text='{best_alternative.transcript[:50]}...'")
|
435 |
+
|
436 |
+
# Log if we exit without any responses
|
437 |
+
if response_count == 0:
|
438 |
+
log_error("β Google STT stream ended without ANY responses!")
|
439 |
+
else:
|
440 |
+
log_info(f"✅ Google STT stream ended normally after {response_count} responses")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
except Exception as e:
|
443 |
+
log_error(f"β Google STT error: {e}")
|
444 |
+
if hasattr(e, 'details'):
|
445 |
+
log_error(f"Error details: {e.details}")
|
446 |
+
self.error_message = str(e)
|
447 |
finally:
|
448 |
log_info("π€ Google STT stream thread ended")
|
449 |
+
with self.lock:
|
450 |
+
self.is_streaming = False
|