Update stt/stt_google.py

stt/stt_google.py (CHANGED: +66, -154)
Old version (removed or changed lines are marked with "-"):

@@ -4,7 +4,7 @@ Google Cloud Speech-to-Text Implementation
import os
import asyncio
from typing import AsyncIterator, AsyncGenerator, Optional, List, Any
- import numpy as np
from datetime import datetime
import sys
import queue
@@ -43,16 +43,15 @@ class GoogleCloudSTT(STTInterface):
        self.session_id = 0
        self.stream_start_time = None

-         #
-         self.lock = threading.Lock()
-         self.single_utterance = False
-         self.chunk_count = 0
-         self.total_bytes = 0
-         self.stop_event = threading.Event()

        # Set Google credentials
        if credentials_path:
-             # Use the path coming from ConfigProvider
            if os.path.exists(credentials_path):
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
                log_info(f"Google credentials set from: {credentials_path}")
@@ -63,14 +62,12 @@ class GoogleCloudSTT(STTInterface):
            # Fallback to environment variable
            creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
            if not creds_path:
-                 # Try default location
                creds_path = "./credentials/google-service-account.json"
                if os.path.exists(creds_path):
                    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
                    log_info(f"Google credentials set from default: {creds_path}")
                else:
                    raise ValueError("Google credentials not found. Please provide credentials_path")
-

        # Test credentials
        try:
@@ -95,19 +92,20 @@ class GoogleCloudSTT(STTInterface):
        }
        return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)

    async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
        """Stream audio chunk and get results"""
        if not self.is_streaming:
            raise Exception("Streaming not started")

        try:
-             # Audio validation and logging
            chunk_size = len(audio_chunk)

            # Log first chunk details
            if self.chunk_count == 0:
                log_info(f"First chunk - size: {chunk_size} bytes")
-                 # Check for WEBM header
                if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
                    log_info("Valid WEBM header detected")
                else:
@@ -116,16 +114,14 @@ class GoogleCloudSTT(STTInterface):

            # Try to measure audio level (if it's raw PCM)
            try:
-                 log_info(f"Audio level estimate: {db:.1f} dB")
            except:
-                 # Expected for WEBM format
                pass

            # Put chunk in queue
@@ -137,8 +133,8 @@ class GoogleCloudSTT(STTInterface):
            if self.chunk_count % 50 == 0:
                log_info(f"Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")

-             # Check for responses
-             timeout = 0.1
            end_time = time.time() + timeout

            while time.time() < end_time:
@@ -147,7 +143,6 @@ class GoogleCloudSTT(STTInterface):
                    log_info(f"Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
                    yield result
                except queue.Empty:
-                     # No results yet, continue
                    await asyncio.sleep(0.01)
                except Exception as e:
                    log_error(f"Error getting result from queue: {e}")
@@ -166,18 +161,19 @@ class GoogleCloudSTT(STTInterface):
        try:
            log_info(f"Stopping Google STT streaming session #{self.session_id}")

-             #
            self.is_streaming = False
            self.stop_event.set()

-             # Send poison pill
            if self.audio_queue:
                try:
                    self.audio_queue.put(None)
                except:
                    pass

-             #
            if self.stream_thread and self.stream_thread.is_alive():
                log_info("Waiting for stream thread to finish...")
                self.stream_thread.join(timeout=5.0)
@@ -187,18 +183,18 @@ class GoogleCloudSTT(STTInterface):
            else:
                log_info("Stream thread finished")

-             #
            final_result = None
            if self.responses_queue:
                while not self.responses_queue.empty():
                    try:
-                         result = self.responses_queue.get_nowait()
                        if result.is_final:
                            final_result = result
-                     except queue.Empty:
                        break

-             #
            if self.client:
                try:
                    if hasattr(self.client, 'transport') and hasattr(self.client.transport, 'close'):
@@ -213,11 +209,9 @@ class GoogleCloudSTT(STTInterface):
                finally:
                    self.client = None

-             #
            self.audio_queue = None
            self.responses_queue = None
-
-             # Reset the remaining variables
            self.stream_thread = None
            self.streaming_config = None
            self.stop_event.clear()
@@ -227,7 +221,6 @@ class GoogleCloudSTT(STTInterface):

        except Exception as e:
            log_error(f"Error during stop_streaming", error=str(e))
-             # Force cleanup on error
            self.is_streaming = False
            self.stream_thread = None
            self.client = None
@@ -244,19 +237,8 @@ class GoogleCloudSTT(STTInterface):
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
-             "tr-TR",
-             "en-US",  # English (US)
-             "en-GB",  # English (UK)
-             "de-DE",  # German
-             "fr-FR",  # French
-             "es-ES",  # Spanish
-             "it-IT",  # Italian
-             "pt-BR",  # Portuguese (Brazil)
-             "ru-RU",  # Russian
-             "ja-JP",  # Japanese
-             "ko-KR",  # Korean
-             "zh-CN",  # Chinese (Simplified)
-             "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
@@ -283,21 +265,18 @@ class GoogleCloudSTT(STTInterface):
        self.error_message = None
        self.session_id += 1
        self.stream_start_time = time.time()
-
-         # Reset counters
        self.chunk_count = 0
        self.total_bytes = 0

        log_info(f"Google STT session data reset. New session ID: {self.session_id}")

-         # Create fresh queues
        self.audio_queue = queue.Queue()
        self.responses_queue = queue.Queue()
        log_debug("Created fresh queues")

    def _create_fresh_queues(self):
        """Create fresh queue instances"""
-         # Clear out the old queues
        if self.audio_queue:
            while not self.audio_queue.empty():
                try:
@@ -312,35 +291,27 @@ class GoogleCloudSTT(STTInterface):
                except:
                    pass

-
-         self.audio_queue = queue.Queue(maxsize=1000)  # add a max size
        self.responses_queue = queue.Queue(maxsize=100)
        log_debug("Created fresh queues")

    def _request_generator(self):
        """Generate requests for the streaming recognize API"""
-         # First request
        yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)

-         #
        while not self.should_stop:
            try:
-                 # Get audio chunk from queue with timeout
                audio_chunk = self.audio_queue.get(timeout=0.1)

                if audio_chunk is None:
-                     # Poison pill received
                    log_info("Poison pill received, stopping request generator")
                    break

-                 # Send audio chunk
                yield speech.StreamingRecognizeRequest(audio_content=audio_chunk)

-                 self.chunk_count += 1
-                 self.total_bytes += len(audio_chunk)
-
            except queue.Empty:
-                 # No audio available, continue waiting
                continue
            except Exception as e:
                log_error(f"Error in request generator: {e}")
@@ -348,30 +319,27 @@ class GoogleCloudSTT(STTInterface):

        log_info(f"Request generator finished. Total chunks: {self.chunk_count}, Total bytes: {self.total_bytes}")

-     async def start_streaming(self, config:
        """Initialize streaming session with clean state"""
        try:
-             #
            if self.is_streaming or self.stream_thread:
                log_warning("Previous stream still active, stopping it first")
                await self.stop_streaming()
-                 # Wait for the cleanup to finish
                await asyncio.sleep(0.5)

-             #
            self._reset_session()
-
            self.single_utterance = config.single_utterance

            log_info(f"Starting Google STT streaming session #{self.session_id} with config: {config}")

-             #
            self._create_fresh_queues()
-
-             # Clear the stop event
            self.stop_event.clear()

-             #
            self.client = speech.SpeechClient()
            log_info("Created new Google Speech client")

@@ -384,20 +352,19 @@ class GoogleCloudSTT(STTInterface):
                model=config.model,
                use_enhanced=config.use_enhanced,
                max_alternatives=1,
-                 # Metadata for better recognition
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                )
-             )

            # Create streaming config with VAD
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
-                 interim_results=config.interim_results,
-                 single_utterance=config.single_utterance,
-                 enable_voice_activity_events=True  # VAD events
            )

            log_info(f"Streaming config created: interim_results={config.interim_results}, "
@@ -406,12 +373,12 @@ class GoogleCloudSTT(STTInterface):

            self.is_streaming = True

-             # Start streaming thread
            self.stream_thread = threading.Thread(
                target=self._run_stream,
                name=f"GoogleSTT-Session-{self.session_id}"
            )
-             self.stream_thread.daemon = True
            self.stream_thread.start()

            log_info(f"Google STT streaming session #{self.session_id} started successfully")
@@ -420,55 +387,33 @@ class GoogleCloudSTT(STTInterface):
            log_error(f"Failed to start Google STT streaming", error=str(e))
            self.is_streaming = False
            self.client = None
-             self._create_fresh_queues()
            raise

-     def _put_result(self, result: TranscriptionResult):
-         """Helper to put result in queue"""
-         try:
-             self.responses_queue.put(result)
-             # Debug log removed
-         except Exception as e:
-             log_error(f"Error queuing result: {e}")
-
    def _run_stream(self):
        """Run the streaming recognition loop in a separate thread"""
        try:
-             log_info("Google STT stream thread started - Single utterance mode: {

            # Create request generator
            requests = self._request_generator()

            # Create streaming client
            log_info("Creating Google STT streaming client...")
-
-             # Set a timeout for the streaming call
-             import grpc
-             timeout = 300  # 5 minutes max for the stream
-
-             # Create streaming client with timeout
-             responses = self.client.streaming_recognize(
-                 self.streaming_config,
-                 requests,
-                 timeout=timeout
-             )
-
-             # Set initial response timeout
-             initial_response_timeout = 30  # 30 seconds to get first response
-             stream_start = time.time()
-             got_first_response = False

-             #
            first_response_time = None
            response_count = 0

-             # Process responses
            for response in responses:
                response_count += 1

                if first_response_time is None:
@@ -476,47 +421,14 @@ class GoogleCloudSTT(STTInterface):
                    elapsed = first_response_time - self.stream_start_time
                    log_info(f"FIRST RESPONSE from Google STT after {elapsed:.2f}s")

-                 #
-                     log_info("Empty response from Google STT (no results)")
-                     continue
-
-                 # Log all results in detail
-                 for result_idx, result in enumerate(response.results):
-                     log_info(f"Result #{result_idx}: is_final={result.is_final}, "
-                              f"alternatives={len(result.alternatives)}, "
-                              f"stability={getattr(result, 'stability', 'N/A')}")

-                     log_info(f"Transcript: '{best_alternative.transcript}' "
-                              f"(confidence: {best_alternative.confidence:.3f})")
-
-                     # Put result in queue
-                     result_obj = TranscriptionResult(
-                         text=best_alternative.transcript,
-                         is_final=result.is_final,
-                         confidence=best_alternative.confidence,
-                         timestamp=datetime.utcnow()
-                     )
-
-                     self.responses_queue.put(result_obj)
-                     log_info(f"Result queued: is_final={result.is_final}, text='{best_alternative.transcript[:50]}...'")
-
-             # Log if we exit without any responses
-             if response_count == 0:
-                 log_error("Google STT stream ended without ANY responses!")
-             else:
-                 log_info(f"Google STT stream ended normally after {response_count} responses")

-                 log_error(f"Error details: {e.details}")
-             self.error_message = str(e)
-         finally:
-             log_info("Google STT stream thread ended")
-             with self.lock:
-                 self.is_streaming = False
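Both the removed and the added versions of the request generator hinge on the same shutdown idiom: audio chunks are pushed into a bounded queue from the async side, a background thread drains it, and a None "poison pill" (together with a stop flag) tells the consumer to exit. A minimal standalone sketch of that idiom, with illustrative names that are not taken from the file:

import queue
import threading

def consume(audio_queue: queue.Queue) -> None:
    # Illustrative consumer: drain the queue until a None sentinel arrives.
    while True:
        try:
            chunk = audio_queue.get(timeout=0.1)  # poll so a stop flag could also be checked
        except queue.Empty:
            continue
        if chunk is None:  # poison pill: producer asks the consumer to stop
            break
        print(f"would send {len(chunk)} bytes to the recognizer")

q: queue.Queue = queue.Queue(maxsize=1000)
worker = threading.Thread(target=consume, args=(q,), daemon=True)
worker.start()
q.put(b"\x00" * 320)  # stand-in for an audio chunk
q.put(None)           # request shutdown
worker.join(timeout=5.0)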
New version (added lines are marked with "+"):

@@ -4,7 +4,7 @@ Google Cloud Speech-to-Text Implementation
import os
import asyncio
from typing import AsyncIterator, AsyncGenerator, Optional, List, Any
+ import numpy as np
from datetime import datetime
import sys
import queue
@@ -43,16 +43,15 @@ class GoogleCloudSTT(STTInterface):
        self.session_id = 0
        self.stream_start_time = None

+         # Additional attributes
+         self.lock = threading.Lock()
+         self.single_utterance = False
+         self.chunk_count = 0
+         self.total_bytes = 0
+         self.stop_event = threading.Event()

        # Set Google credentials
        if credentials_path:
            if os.path.exists(credentials_path):
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
                log_info(f"Google credentials set from: {credentials_path}")
@@ -63,14 +62,12 @@ class GoogleCloudSTT(STTInterface):
            # Fallback to environment variable
            creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
            if not creds_path:
                creds_path = "./credentials/google-service-account.json"
                if os.path.exists(creds_path):
                    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
                    log_info(f"Google credentials set from default: {creds_path}")
                else:
                    raise ValueError("Google credentials not found. Please provide credentials_path")

        # Test credentials
        try:
@@ -95,19 +92,20 @@ class GoogleCloudSTT(STTInterface):
        }
        return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)

+     # Alias for compatibility
+     _get_google_encoding = _get_encoding
+
    async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
        """Stream audio chunk and get results"""
        if not self.is_streaming:
            raise Exception("Streaming not started")

        try:
            chunk_size = len(audio_chunk)

            # Log first chunk details
            if self.chunk_count == 0:
                log_info(f"First chunk - size: {chunk_size} bytes")
                if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
                    log_info("Valid WEBM header detected")
                else:
@@ -116,16 +114,14 @@ class GoogleCloudSTT(STTInterface):

            # Try to measure audio level (if it's raw PCM)
            try:
+                 if encoding_str == "LINEAR16":  # Only for raw PCM
+                     audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
+                     if len(audio_array) > 0:
+                         rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
+                         db = 20 * np.log10(max(rms, 1) / 32768.0)
+                         if self.chunk_count % 50 == 0:
+                             log_info(f"Audio level: {db:.1f} dB")
            except:
                pass

            # Put chunk in queue
@@ -137,8 +133,8 @@ class GoogleCloudSTT(STTInterface):
            if self.chunk_count % 50 == 0:
                log_info(f"Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")

+             # Check for responses
+             timeout = 0.1
            end_time = time.time() + timeout

            while time.time() < end_time:
@@ -147,7 +143,6 @@ class GoogleCloudSTT(STTInterface):
                    log_info(f"Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
                    yield result
                except queue.Empty:
                    await asyncio.sleep(0.01)
                except Exception as e:
                    log_error(f"Error getting result from queue: {e}")
@@ -166,18 +161,19 @@ class GoogleCloudSTT(STTInterface):
        try:
            log_info(f"Stopping Google STT streaming session #{self.session_id}")

+             # Set flags
            self.is_streaming = False
+             self.should_stop = True
            self.stop_event.set()

+             # Send poison pill
            if self.audio_queue:
                try:
                    self.audio_queue.put(None)
                except:
                    pass

+             # Wait for thread
            if self.stream_thread and self.stream_thread.is_alive():
                log_info("Waiting for stream thread to finish...")
                self.stream_thread.join(timeout=5.0)
@@ -187,18 +183,18 @@ class GoogleCloudSTT(STTInterface):
            else:
                log_info("Stream thread finished")

+             # Get final result
            final_result = None
            if self.responses_queue:
                while not self.responses_queue.empty():
                    try:
+                         result = self.responses_queue.get_nowait()
                        if result.is_final:
                            final_result = result
+                     except queue.Empty:
                        break

+             # Close client
            if self.client:
                try:
                    if hasattr(self.client, 'transport') and hasattr(self.client.transport, 'close'):
@@ -213,11 +209,9 @@ class GoogleCloudSTT(STTInterface):
                finally:
                    self.client = None

+             # Reset state
            self.audio_queue = None
            self.responses_queue = None
            self.stream_thread = None
            self.streaming_config = None
            self.stop_event.clear()
@@ -227,7 +221,6 @@ class GoogleCloudSTT(STTInterface):

        except Exception as e:
            log_error(f"Error during stop_streaming", error=str(e))
            self.is_streaming = False
            self.stream_thread = None
            self.client = None
@@ -244,19 +237,8 @@ class GoogleCloudSTT(STTInterface):
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
+             "tr-TR", "en-US", "en-GB", "de-DE", "fr-FR", "es-ES",
+             "it-IT", "pt-BR", "ru-RU", "ja-JP", "ko-KR", "zh-CN", "ar-SA"
        ]

    def get_provider_name(self) -> str:
@@ -283,21 +265,18 @@ class GoogleCloudSTT(STTInterface):
        self.error_message = None
        self.session_id += 1
        self.stream_start_time = time.time()
        self.chunk_count = 0
        self.total_bytes = 0

        log_info(f"Google STT session data reset. New session ID: {self.session_id}")

+         # Create fresh queues
        self.audio_queue = queue.Queue()
        self.responses_queue = queue.Queue()
        log_debug("Created fresh queues")

    def _create_fresh_queues(self):
        """Create fresh queue instances"""
        if self.audio_queue:
            while not self.audio_queue.empty():
                try:
@@ -312,35 +291,27 @@ class GoogleCloudSTT(STTInterface):
                except:
                    pass

+         self.audio_queue = queue.Queue(maxsize=1000)
        self.responses_queue = queue.Queue(maxsize=100)
        log_debug("Created fresh queues")

    def _request_generator(self):
        """Generate requests for the streaming recognize API"""
+         # First request with config
        yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)

+         # Audio chunks
        while not self.should_stop:
            try:
                audio_chunk = self.audio_queue.get(timeout=0.1)

                if audio_chunk is None:
                    log_info("Poison pill received, stopping request generator")
                    break

                yield speech.StreamingRecognizeRequest(audio_content=audio_chunk)

            except queue.Empty:
                continue
            except Exception as e:
                log_error(f"Error in request generator: {e}")
@@ -348,30 +319,27 @@ class GoogleCloudSTT(STTInterface):

        log_info(f"Request generator finished. Total chunks: {self.chunk_count}, Total bytes: {self.total_bytes}")

+     async def start_streaming(self, config: STTConfig) -> None:
        """Initialize streaming session with clean state"""
        try:
+             # Clean up any existing stream
            if self.is_streaming or self.stream_thread:
                log_warning("Previous stream still active, stopping it first")
                await self.stop_streaming()
                await asyncio.sleep(0.5)

+             # Reset session
            self._reset_session()
            self.single_utterance = config.single_utterance

            log_info(f"Starting Google STT streaming session #{self.session_id} with config: {config}")

+             # Create fresh queues
            self._create_fresh_queues()
            self.stop_event.clear()
+             self.should_stop = False

+             # Create new client
            self.client = speech.SpeechClient()
            log_info("Created new Google Speech client")

@@ -384,20 +352,19 @@ class GoogleCloudSTT(STTInterface):
                model=config.model,
                use_enhanced=config.use_enhanced,
                max_alternatives=1,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                )
+             )

            # Create streaming config with VAD
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
+                 interim_results=config.interim_results,
+                 single_utterance=config.single_utterance,
+                 enable_voice_activity_events=True  # VAD events enabled
            )

            log_info(f"Streaming config created: interim_results={config.interim_results}, "
@@ -406,12 +373,12 @@ class GoogleCloudSTT(STTInterface):

            self.is_streaming = True

+             # Start streaming thread
            self.stream_thread = threading.Thread(
                target=self._run_stream,
                name=f"GoogleSTT-Session-{self.session_id}"
            )
+             self.stream_thread.daemon = True
            self.stream_thread.start()

            log_info(f"Google STT streaming session #{self.session_id} started successfully")
@@ -420,55 +387,33 @@ class GoogleCloudSTT(STTInterface):
            log_error(f"Failed to start Google STT streaming", error=str(e))
            self.is_streaming = False
            self.client = None
+             self._create_fresh_queues()
            raise

    def _run_stream(self):
        """Run the streaming recognition loop in a separate thread"""
        try:
+             log_info(f"Google STT stream thread started - Single utterance mode: {self.single_utterance}")

            # Create request generator
            requests = self._request_generator()

            # Create streaming client
            log_info("Creating Google STT streaming client...")

+             # Get responses (no timeout parameter!)
+             responses = self.client.streaming_recognize(self.streaming_config, requests)
+
+             # Track responses
            first_response_time = None
            response_count = 0

+             # Process responses
            for response in responses:
+                 if self.should_stop:
+                     log_info("Stop flag detected, ending stream")
+                     break
+
                response_count += 1

                if first_response_time is None:
@@ -476,47 +421,14 @@ class GoogleCloudSTT(STTInterface):
                    elapsed = first_response_time - self.stream_start_time
                    log_info(f"FIRST RESPONSE from Google STT after {elapsed:.2f}s")

+                 # Check for VAD events
+                 if hasattr(response, 'speech_event_type') and response.speech_event_type:
+                     event_type = response.speech_event_type
+                     log_info(f"VAD Event: {event_type}")

+                     if event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
+                         log_info("End of utterance detected by VAD")

+                 # Log response
+                 has_results = len(response.results) > 0 if hasattr(response, 'results') else False
+                 log_info(f"Google STT Response #{response_count}: has_results={has_results}")
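For context, a hedged sketch of how the updated class might be driven end to end. The import path, the constructor argument, and the shapes of STTConfig and TranscriptionResult are assumptions based only on what this diff shows (start_streaming, stream_audio as an async generator, stop_streaming, and results carrying text and is_final):

from stt.stt_google import GoogleCloudSTT  # assumed import path for this file

async def transcribe(chunks, config) -> None:
    # config is assumed to be an STTConfig with the fields referenced in the diff
    # (interim_results, single_utterance, model, use_enhanced, ...).
    stt = GoogleCloudSTT(credentials_path="./credentials/google-service-account.json")
    await stt.start_streaming(config)
    try:
        for chunk in chunks:  # e.g. WEBM/Opus chunks received from a client
            async for result in stt.stream_audio(chunk):
                label = "final" if result.is_final else "interim"
                print(f"[{label}] {result.text}")
    finally:
        await stt.stop_streaming()

# Hypothetical usage from an event loop: asyncio.run(transcribe(webm_chunks, config))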