ciyidogan committed on
Commit
78b5a88
·
verified ·
1 Parent(s): a4bca86

Update stt/stt_google.py

Browse files
Files changed (1) hide show
  1. stt/stt_google.py +364 -503
stt/stt_google.py CHANGED
@@ -1,503 +1,364 @@
1
- """
2
- Google Cloud Speech-to-Text Implementation
3
- """
4
- import os
5
- import asyncio
6
- from typing import AsyncIterator, AsyncGenerator, Optional, List, Any
7
- import numpy as np
8
- from datetime import datetime
9
- import sys
10
- import queue
11
- import threading
12
- import time
13
- import traceback
14
- from utils.logger import log_info, log_error, log_debug, log_warning
15
-
16
- # Import Google Cloud Speech only if available
17
- try:
18
- from google.cloud import speech
19
- from google.api_core import exceptions
20
- GOOGLE_SPEECH_AVAILABLE = True
21
- except ImportError:
22
- GOOGLE_SPEECH_AVAILABLE = False
23
- log_info("⚠️ Google Cloud Speech library not installed")
24
-
25
- from .stt_interface import STTInterface, STTConfig, TranscriptionResult
26
-
27
- class GoogleCloudSTT(STTInterface):
28
- """Google Cloud Speech-to-Text implementation"""
29
-
30
- def __init__(self, credentials_path: Optional[str] = None):
31
- """Initialize Google Cloud STT"""
32
- log_info("🎀 Creating STT provider: google")
33
-
34
- # Initialize all required attributes
35
- self.client = None
36
- self.streaming_config = None
37
- self.stream_thread = None
38
- self.audio_queue = queue.Queue()
39
- self.responses_queue = queue.Queue()
40
- self.is_streaming = False
41
- self.should_stop = False
42
- self.error_message = None
43
- self.session_id = 0
44
- self.stream_start_time = None
45
-
46
- # Additional attributes
47
- self.lock = threading.Lock()
48
- self.single_utterance = False
49
- self.chunk_count = 0
50
- self.total_bytes = 0
51
- self.stop_event = threading.Event()
52
-
53
- # Set Google credentials
54
- if credentials_path:
55
- if os.path.exists(credentials_path):
56
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
57
- log_info(f"βœ… Google credentials set from: {credentials_path}")
58
- else:
59
- log_error(f"❌ Credentials file not found: {credentials_path}")
60
- raise ValueError(f"Google credentials file not found: {credentials_path}")
61
- else:
62
- # Fallback to environment variable
63
- creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
64
- if not creds_path:
65
- creds_path = "./credentials/google-service-account.json"
66
- if os.path.exists(creds_path):
67
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
68
- log_info(f"βœ… Google credentials set from default: {creds_path}")
69
- else:
70
- raise ValueError("Google credentials not found. Please provide credentials_path")
71
-
72
- # Test credentials
73
- try:
74
- log_info("πŸ” Testing Google credentials...")
75
- test_client = speech.SpeechClient()
76
- log_info("βœ… Google credentials valid")
77
- except Exception as e:
78
- log_error(f"❌ Invalid Google credentials: {e}")
79
- raise
80
-
81
- def _get_encoding(self, encoding_str: str):
82
- """Convert encoding string to Google Speech enum"""
83
- if not GOOGLE_SPEECH_AVAILABLE:
84
- return None
85
-
86
- encoding_map = {
87
- "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
88
- "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
89
- "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
90
- "MP3": speech.RecognitionConfig.AudioEncoding.MP3,
91
- "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
92
- }
93
- return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
94
-
95
- # Alias for compatibility
96
- _get_google_encoding = _get_encoding
97
-
98
- async def stream_audio(self, audio_chunk: bytes) -> AsyncGenerator[TranscriptionResult, None]:
99
- """Stream audio chunk and get results"""
100
- if not self.is_streaming:
101
- raise Exception("Streaming not started")
102
-
103
- try:
104
- chunk_size = len(audio_chunk)
105
-
106
- # Log first chunk details
107
- if self.chunk_count == 0:
108
- log_info(f"πŸ“€ First chunk - size: {chunk_size} bytes")
109
- if audio_chunk.startswith(b'\x1a\x45\xdf\xa3'):
110
- log_info("βœ… Valid WEBM header detected")
111
- else:
112
- hex_preview = audio_chunk[:20].hex()
113
- log_warning(f"⚠️ Unexpected audio format. First 20 bytes: {hex_preview}")
114
-
115
- # Try to measure audio level (if it's raw PCM)
116
- try:
117
- if encoding_str == "LINEAR16": # Only for raw PCM
118
- audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
119
- if len(audio_array) > 0:
120
- rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
121
- db = 20 * np.log10(max(rms, 1) / 32768.0)
122
- if self.chunk_count % 50 == 0:
123
- log_info(f"πŸ”Š Audio level: {db:.1f} dB")
124
- except:
125
- pass
126
-
127
- # Put chunk in queue
128
- self.audio_queue.put(audio_chunk)
129
- self.chunk_count += 1
130
- self.total_bytes += chunk_size
131
-
132
- # Log progress
133
- if self.chunk_count % 50 == 0:
134
- log_info(f"πŸ“€ Progress: {self.chunk_count} chunks, {self.total_bytes/1024:.1f}KB total")
135
-
136
- # Check for responses
137
- timeout = 0.1
138
- end_time = time.time() + timeout
139
-
140
- while time.time() < end_time:
141
- try:
142
- result = self.responses_queue.get_nowait()
143
- log_info(f"🎯 Got result from queue: is_final={result.is_final}, text='{result.text[:30]}...'")
144
- yield result
145
- except queue.Empty:
146
- await asyncio.sleep(0.01)
147
- except Exception as e:
148
- log_error(f"Error getting result from queue: {e}")
149
- break
150
-
151
- except Exception as e:
152
- log_error(f"❌ Error in stream_audio: {e}")
153
- raise
154
-
155
- async def stop_streaming(self) -> Optional[TranscriptionResult]:
156
- """Stop streaming and clean up all resources"""
157
- if not self.is_streaming and not self.stream_thread:
158
- log_debug("Already stopped, nothing to do")
159
- return None
160
-
161
- try:
162
- log_info(f"πŸ›‘ Stopping Google STT streaming session #{self.session_id}")
163
-
164
- # Set flags
165
- self.is_streaming = False
166
- self.should_stop = True
167
- self.stop_event.set()
168
-
169
- # Send poison pill
170
- if self.audio_queue:
171
- try:
172
- self.audio_queue.put(None)
173
- except:
174
- pass
175
-
176
- # Wait for thread
177
- if self.stream_thread and self.stream_thread.is_alive():
178
- log_info("⏳ Waiting for stream thread to finish...")
179
- self.stream_thread.join(timeout=5.0)
180
-
181
- if self.stream_thread.is_alive():
182
- log_warning("⚠️ STT thread did not stop gracefully after 5s")
183
- else:
184
- log_info("βœ… Stream thread finished")
185
-
186
- # Get final result
187
- final_result = None
188
- if self.responses_queue:
189
- while not self.responses_queue.empty():
190
- try:
191
- result = self.responses_queue.get_nowait()
192
- if result.is_final:
193
- final_result = result
194
- except queue.Empty:
195
- break
196
-
197
- # Close client
198
- if self.client:
199
- try:
200
- if hasattr(self.client, 'transport') and hasattr(self.client.transport, 'close'):
201
- self.client.transport.close()
202
- log_debug("βœ… Client transport closed")
203
-
204
- if hasattr(self.client, '_transport') and hasattr(self.client._transport, '_grpc_channel'):
205
- self.client._transport._grpc_channel.close()
206
- log_debug("βœ… gRPC channel closed")
207
- except Exception as e:
208
- log_warning(f"⚠️ Error closing Google client: {e}")
209
- finally:
210
- self.client = None
211
-
212
- # Reset state
213
- self.audio_queue = None
214
- self.responses_queue = None
215
- self.stream_thread = None
216
- self.streaming_config = None
217
- self.stop_event.clear()
218
-
219
- log_info(f"βœ… Google STT streaming session #{self.session_id} stopped and cleaned")
220
- return final_result
221
-
222
- except Exception as e:
223
- log_error(f"❌ Error during stop_streaming", error=str(e))
224
- self.is_streaming = False
225
- self.stream_thread = None
226
- self.client = None
227
- self.streaming_config = None
228
- self.stop_event.clear()
229
- self.audio_queue = None
230
- self.responses_queue = None
231
- return None
232
-
233
- def supports_realtime(self) -> bool:
234
- """Google Cloud STT supports real-time streaming"""
235
- return True
236
-
237
- def get_supported_languages(self) -> List[str]:
238
- """Get list of supported language codes"""
239
- return [
240
- "tr-TR", "en-US", "en-GB", "de-DE", "fr-FR", "es-ES",
241
- "it-IT", "pt-BR", "ru-RU", "ja-JP", "ko-KR", "zh-CN", "ar-SA"
242
- ]
243
-
244
- def get_provider_name(self) -> str:
245
- """Get provider name"""
246
- return "google"
247
-
248
- def _reset_session(self):
249
- """Reset session data"""
250
- # Clear queues
251
- while not self.audio_queue.empty():
252
- try:
253
- self.audio_queue.get_nowait()
254
- except queue.Empty:
255
- break
256
-
257
- while not self.responses_queue.empty():
258
- try:
259
- self.responses_queue.get_nowait()
260
- except queue.Empty:
261
- break
262
-
263
- # Reset state
264
- self.should_stop = False
265
- self.error_message = None
266
- self.session_id += 1
267
- self.stream_start_time = time.time()
268
- self.chunk_count = 0
269
- self.total_bytes = 0
270
-
271
- log_info(f"πŸ”„ Google STT session data reset. New session ID: {self.session_id}")
272
-
273
- # Create fresh queues
274
- self.audio_queue = queue.Queue()
275
- self.responses_queue = queue.Queue()
276
- log_debug("βœ… Created fresh queues")
277
-
278
- def _create_fresh_queues(self):
279
- """Create fresh queue instances"""
280
- if self.audio_queue:
281
- while not self.audio_queue.empty():
282
- try:
283
- self.audio_queue.get_nowait()
284
- except:
285
- pass
286
-
287
- if self.responses_queue:
288
- while not self.responses_queue.empty():
289
- try:
290
- self.responses_queue.get_nowait()
291
- except:
292
- pass
293
-
294
- self.audio_queue = queue.Queue(maxsize=1000)
295
- self.responses_queue = queue.Queue(maxsize=100)
296
- log_debug("βœ… Created fresh queues")
297
-
298
- def _request_generator(self):
299
- """Generate requests for the streaming recognize API"""
300
- # First request with config
301
- yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)
302
-
303
- # Audio chunks
304
- while not self.should_stop:
305
- try:
306
- audio_chunk = self.audio_queue.get(timeout=0.1)
307
-
308
- if audio_chunk is None:
309
- log_info("πŸ“› Poison pill received, stopping request generator")
310
- break
311
-
312
- yield speech.StreamingRecognizeRequest(audio_content=audio_chunk)
313
-
314
- except queue.Empty:
315
- continue
316
- except Exception as e:
317
- log_error(f"Error in request generator: {e}")
318
- break
319
-
320
- log_info(f"πŸ“Š Request generator finished. Total chunks: {self.chunk_count}, Total bytes: {self.total_bytes}")
321
-
322
- async def start_streaming(self, config: STTConfig) -> None:
323
- """Initialize streaming session with clean state"""
324
- try:
325
- # Thread safety iΓ§in lock kullan
326
- async with asyncio.Lock():
327
- # Clean up any existing stream
328
- if self.is_streaming or self.stream_thread:
329
- log_warning("⚠️ Previous stream still active, stopping it first")
330
- await self.stop_streaming()
331
- await asyncio.sleep(0.5)
332
-
333
- # Double-check after cleanup
334
- if self.stream_thread and self.stream_thread.is_alive():
335
- log_error(f"❌ Stream thread STILL running after cleanup! Thread: {self.stream_thread.name}")
336
- raise Exception("Failed to stop previous stream thread")
337
-
338
- # Reset session
339
- self._reset_session()
340
- self.single_utterance = config.single_utterance
341
- self.current_encoding = config.encoding
342
-
343
- log_info(f"🎀 Starting Google STT streaming session #{self.session_id} with config: {config}")
344
-
345
- # Create fresh queues
346
- self._create_fresh_queues()
347
- self.stop_event.clear()
348
- self.should_stop = False
349
-
350
- # Create new client
351
- self.client = speech.SpeechClient()
352
- log_info("βœ… Created new Google Speech client")
353
-
354
- # Create recognition config
355
- recognition_config = speech.RecognitionConfig(
356
- encoding=speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
357
- sample_rate_hertz=16000,
358
- language_code="tr-TR",
359
- enable_automatic_punctuation=True,
360
- model="latest_long",
361
- use_enhanced=True,
362
- max_alternatives=1,
363
- metadata=speech.RecognitionMetadata(
364
- interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
365
- microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
366
- recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
367
- )
368
- )
369
-
370
- # Create streaming config with VAD
371
- self.streaming_config = speech.StreamingRecognitionConfig(
372
- config=recognition_config,
373
- interim_results=True,
374
- single_utterance=False,
375
- enable_voice_activity_events=True # βœ… VAD events enabled
376
- )
377
-
378
- self.is_streaming = True
379
- self.stop_event.clear()
380
-
381
- # Thread başlatmadan ânce son kontrol
382
- if self.stream_thread is not None:
383
- log_error("❌ stream_thread should be None at this point!")
384
- self.stream_thread = None
385
-
386
- self.is_streaming = True
387
-
388
- # Start streaming thread with unique ID
389
- thread_id = f"GoogleSTT-Session-{self.session_id}-{int(time.time()*1000)}"
390
- self.stream_thread = threading.Thread(
391
- target=self._run_stream,
392
- name=thread_id
393
- )
394
- self.stream_thread.daemon = True
395
-
396
- log_info(f"πŸš€ Starting thread: {thread_id}")
397
- self.stream_thread.start()
398
-
399
- log_info(f"βœ… Google STT streaming session #{self.session_id} started successfully")
400
-
401
- except Exception as e:
402
- log_error(f"❌ Failed to start Google STT streaming", error=str(e))
403
- self.is_streaming = False
404
- self.client = None
405
- self._create_fresh_queues()
406
- raise
407
-
408
- def _run_stream(self):
409
- """Run the streaming recognition loop in a separate thread"""
410
- try:
411
- thread_id = threading.current_thread().ident
412
- log_info(f"🎀 Google STT stream thread started - Thread ID: {thread_id}, Session: {self.session_id}")
413
-
414
- # Create request generator
415
- requests = self._request_generator()
416
-
417
- # Create streaming client
418
- log_info(f"🎀 Creating Google STT streaming client... Thread ID: {thread_id}")
419
-
420
- # Get responses (no timeout parameter!)
421
- responses = self.client.streaming_recognize(requests)
422
-
423
- # Track responses
424
- first_response_time = None
425
- response_count = 0
426
-
427
- # Process responses
428
- for response in responses:
429
- if self.should_stop:
430
- log_info("πŸ›‘ Stop flag detected, ending stream")
431
- break
432
-
433
- response_count += 1
434
-
435
- if first_response_time is None:
436
- first_response_time = time.time()
437
- elapsed = first_response_time - self.stream_start_time
438
- log_info(f"πŸŽ‰ FIRST RESPONSE from Google STT after {elapsed:.2f}s")
439
-
440
- # Check for VAD events
441
- if hasattr(response, 'speech_event_type') and response.speech_event_type:
442
- event_type = response.speech_event_type
443
- log_info(f"πŸŽ™οΈ VAD Event: {event_type}")
444
-
445
- if event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
446
- log_info("πŸ”š End of utterance detected by VAD")
447
-
448
- # Log response
449
- has_results = len(response.results) > 0 if hasattr(response, 'results') else False
450
- log_info(f"πŸ“¨ Google STT Response #{response_count}: has_results={has_results}")
451
-
452
- if not response.results:
453
- continue
454
-
455
- # Process results
456
- for result_idx, result in enumerate(response.results):
457
- # Check result type
458
- result_type = "πŸ”„ INTERIM" if not result.is_final else "βœ… FINAL"
459
- stability = getattr(result, 'stability', 0.0)
460
-
461
- log_info(f"{result_type} Result #{result_idx}: "
462
- f"alternatives={len(result.alternatives)}, "
463
- f"stability={stability:.3f}")
464
-
465
- if result.alternatives:
466
- best_alternative = result.alternatives[0]
467
- transcript = best_alternative.transcript
468
- confidence = best_alternative.confidence if result.is_final else stability
469
-
470
- # Log transcript
471
- if result.is_final:
472
- log_info(f"βœ… FINAL TRANSCRIPT: '{transcript}' "
473
- f"(confidence: {confidence:.3f})")
474
- else:
475
- log_info(f"πŸ”„ INTERIM TRANSCRIPT: '{transcript[:100]}...' "
476
- f"(stability: {stability:.3f})")
477
-
478
- # Queue result
479
- result_obj = TranscriptionResult(
480
- text=transcript,
481
- is_final=result.is_final,
482
- confidence=confidence,
483
- timestamp=datetime.utcnow()
484
- )
485
-
486
- self.responses_queue.put(result_obj)
487
- log_info(f"πŸ“₯ {'FINAL' if result.is_final else 'INTERIM'} result queued")
488
-
489
- # Log completion
490
- if response_count == 0:
491
- log_error("❌ Google STT stream ended without ANY responses!")
492
- else:
493
- log_info(f"βœ… Google STT stream ended normally after {response_count} responses")
494
-
495
- except Exception as e:
496
- log_error(f"❌ Google STT error: {e}")
497
- if hasattr(e, 'details'):
498
- log_error(f"Error details: {e.details}")
499
- self.error_message = str(e)
500
- finally:
501
- log_info("🎀 Google STT stream thread ended")
502
- with self.lock:
503
- self.is_streaming = False
 
1
+ """
2
+ Google Cloud Speech-to-Text Implementation
3
+ """
4
+ import asyncio
5
+ from typing import AsyncIterator, Optional, List, Any
6
+ from datetime import datetime
7
+ import queue
8
+ import threading
9
+ import traceback
10
+ import os
11
+ from google.cloud import speech
12
+ from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
13
+ import google.auth
14
+ from utils.logger import log_info, log_error, log_debug, log_warning
15
+ from .stt_interface import STTInterface, STTConfig, TranscriptionResult
16
+
17
+ class GoogleSTT(STTInterface):
18
+ """Google Cloud Speech-to-Text implementation"""
19
def __init__(self, credentials_path: Optional[str] = None):
    """Create the Speech client and the per-instance streaming state.

    Args:
        credentials_path: Path to a service-account JSON file. When it is
            empty or None the client is built without an explicit key file
            (default credentials).

    Raises:
        Exception: Whatever the client construction raises; it is logged
            and re-raised unchanged.
    """
    try:
        if not credentials_path:
            # No key file supplied: rely on default credentials (ADC).
            self.client = speech.SpeechClient()
            log_info("βœ… Google STT initialized with default credentials")
        else:
            self.client = speech.SpeechClient.from_service_account_file(credentials_path)
            log_info(f"βœ… Google STT initialized with service account: {credentials_path}")

        # Queues shared between the async API and the recognition thread:
        # audio goes in unbounded, results come back through a bounded queue.
        self.audio_queue = queue.Queue()
        self.results_queue = queue.Queue(maxsize=100)

        # Streaming flags / handles
        self.is_streaming = False
        self.audio_generator = None
        self.responses_stream = None

        # Background worker and its stop signal
        self.stop_event = threading.Event()
        self.stream_thread = None

        # Per-session counters
        self.session_id = 0
        self.total_chunks = 0
        self.total_audio_bytes = 0

    except Exception as e:
        log_error(f"❌ Failed to initialize Google STT: {str(e)}")
        raise
54
+
55
+ def _map_language_code(self, language: str) -> str:
56
+ """Map language codes to Google format"""
57
+ # Google uses BCP-47 language codes
58
+ language_map = {
59
+ "tr-TR": "tr-TR",
60
+ "en-US": "en-US",
61
+ "en-GB": "en-GB",
62
+ "de-DE": "de-DE",
63
+ "fr-FR": "fr-FR",
64
+ "es-ES": "es-ES",
65
+ "it-IT": "it-IT",
66
+ "pt-BR": "pt-BR",
67
+ "ru-RU": "ru-RU",
68
+ "ja-JP": "ja-JP",
69
+ "ko-KR": "ko-KR",
70
+ "zh-CN": "zh-CN",
71
+ "ar-SA": "ar-SA",
72
+ }
73
+ return language_map.get(language, language)
74
+
75
async def start_streaming(self, config: STTConfig) -> None:
    """Start a new single-utterance streaming recognition session.

    Stops any previous session, resets the queues and counters, builds the
    recognition/streaming configuration and launches the background
    recognition thread.

    Args:
        config: Session settings. Only ``config.language`` is read here;
            encoding (LINEAR16) and sample rate (16000 Hz) are fixed.

    Raises:
        RuntimeError: If the previous recognition thread is still running
            after it was asked to stop.
        Exception: Any other failure is logged and re-raised after
            ``is_streaming`` has been reset to False.
    """
    try:
        previous_thread = self.stream_thread

        # Stop any existing stream
        if self.is_streaming:
            log_warning("⚠️ Previous stream still active, stopping it first")
            await self.stop_streaming()
            await asyncio.sleep(0.5)

        # is_streaming is cleared as soon as a final result is yielded, which
        # can happen while the old worker is still alive and still reading
        # from audio_queue. Make sure it is gone before the queues are reused,
        # otherwise two workers would consume the same audio.
        if previous_thread is not None and previous_thread.is_alive():
            self.stop_event.set()
            self.audio_queue.put(None)
            previous_thread.join(timeout=2.0)
            if previous_thread.is_alive():
                raise RuntimeError("Failed to stop previous stream thread")

        # Reset session data
        self._reset_session_data()

        log_info(f"🎀 Starting Google STT - Session #{self.session_id}")

        # Configure recognition settings
        language_code = self._map_language_code(config.language)

        # Recognition settings aimed at Turkish and single-utterance use
        recognition_config = RecognitionConfig(
            encoding=RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=language_code,
            # Settings well suited to single-utterance mode
            enable_automatic_punctuation=True,
            # Model selection - latest_long for better accuracy
            model="latest_long",
            # Use enhanced model if available (better for Turkish)
            use_enhanced=True,
            # Single channel audio
            audio_channel_count=1,
            # Boost adaptation for better Turkish recognition
            speech_contexts=[
                speech.SpeechContext(
                    phrases=[],  # Left empty, but the context is still sent
                    boost=20.0
                )
            ],
            # Alternative transcripts for debugging
            max_alternatives=1,
            # Profanity filter disabled for accuracy
            profanity_filter=False,
            # Speaker diarization stays off: `enable_speaker_diarization` is
            # not a field of the v1 RecognitionConfig (diarization is set via
            # `diarization_config`), so passing it makes the constructor fail.
            # Word level confidence
            enable_word_confidence=False,
            enable_spoken_punctuation=False,
            enable_spoken_emojis=False,
        )

        # Streaming config - optimized for final results only
        self.streaming_config = StreamingRecognitionConfig(
            config=recognition_config,
            # Single utterance mode - stops after detecting speech end
            single_utterance=True,
            # No interim results - only final
            interim_results=False
        )

        log_info(f"πŸ”§ Google STT config: language={language_code}, "
                 f"model=latest_long, enhanced=True, "
                 f"single_utterance=True, interim_results=False")

        # Start streaming in background thread
        self.stop_event.clear()
        self.stream_thread = threading.Thread(
            target=self._stream_recognition,
            daemon=True
        )

        # Mark the session active *before* the worker starts. The worker's
        # `finally` resets the flag to False; setting it afterwards could
        # overwrite that reset and leave a dead session marked as streaming.
        self.is_streaming = True
        self.stream_thread.start()

        log_info(f"βœ… Google STT started - Ready for speech")

    except Exception as e:
        log_error(f"❌ Failed to start Google STT", error=str(e))
        self.is_streaming = False
        raise
152
+
153
def _stream_recognition(self):
    """Run one streaming-recognition call; target of the background thread.

    Audio chunks from ``_audio_generator`` are sent to the API and every
    non-empty transcript is pushed to ``results_queue`` as a
    ``TranscriptionResult``. If the call fails while no stop was requested,
    an empty final result is queued so consumers see the stream end.
    ``is_streaming`` is cleared when the thread finishes.
    """
    try:
        log_debug("πŸŽ™οΈ Starting recognition stream thread")

        # The client helper sends `streaming_config` itself as the first
        # request; everything after that must be StreamingRecognizeRequest
        # messages, so the raw byte chunks are wrapped here.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in self._audio_generator()
        )

        # Start streaming recognition
        responses = self.client.streaming_recognize(
            self.streaming_config,
            requests
        )

        end_of_utterance = (
            speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE
        )

        # Process responses
        for response in responses:
            if self.stop_event.is_set():
                break

            # Handle the event before the empty-results shortcut: the event
            # response normally carries no results. The final transcript is
            # delivered *after* this event, so stop sending audio (sentinel
            # ends the request generator) but keep reading responses.
            if getattr(response, 'speech_event_type', None) == end_of_utterance:
                log_info("πŸ”š End of single utterance detected")
                self.audio_queue.put(None)

            if not response.results:
                continue

            # Process each result
            for result in response.results:
                if not result.alternatives:
                    continue

                # Get best alternative
                alternative = result.alternatives[0]

                # Only process if we have transcript
                if not alternative.transcript:
                    continue

                transcription_result = TranscriptionResult(
                    text=alternative.transcript,
                    is_final=result.is_final,
                    confidence=alternative.confidence,
                    timestamp=datetime.now().timestamp()
                )

                try:
                    # Non-blocking: a blocking put() never raises queue.Full
                    # and would hang this thread on a full queue.
                    self.results_queue.put_nowait(transcription_result)
                except queue.Full:
                    log_warning("⚠️ Results queue full")
                    continue

                if result.is_final:
                    log_info(f"🎯 FINAL TRANSCRIPT: '{alternative.transcript}' "
                             f"(confidence: {alternative.confidence:.2f})")
                    # Skip the remaining results of this response; the outer
                    # loop keeps reading until the response stream ends.
                    break

                # This shouldn't happen with interim_results=False
                log_debug(f"πŸ“ Transcript: '{alternative.transcript}'")

    except Exception as e:
        if not self.stop_event.is_set():
            log_error(f"❌ Recognition stream error: {str(e)}")
            # Put error in queue
            error_result = TranscriptionResult(
                text="",
                is_final=True,
                confidence=0.0,
                timestamp=datetime.now().timestamp()
            )
            try:
                self.results_queue.put_nowait(error_result)
            except queue.Full:
                log_warning("⚠️ Results queue full")
    finally:
        log_debug("πŸŽ™οΈ Recognition stream thread ended")
        # Only clear the flag for the session this thread belongs to; a
        # newer session may already have replaced `stream_thread`.
        if self.stream_thread is None or self.stream_thread is threading.current_thread():
            self.is_streaming = False
227
+
228
+ def _audio_generator(self):
229
+ """Generator that yields audio chunks for streaming"""
230
+ while not self.stop_event.is_set():
231
+ try:
232
+ # Get audio chunk with timeout
233
+ chunk = self.audio_queue.get(timeout=0.1)
234
+
235
+ if chunk is None: # Sentinel value
236
+ break
237
+
238
+ yield chunk
239
+
240
+ except queue.Empty:
241
+ continue
242
+ except Exception as e:
243
+ log_error(f"❌ Audio generator error: {str(e)}")
244
+ break
245
+
246
async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
    """Queue one audio chunk and yield whatever results are already waiting.

    The chunk is handed to the background thread through ``audio_queue``;
    the method then drains ``results_queue`` without blocking. Yielding a
    final result clears ``is_streaming``.

    Raises:
        RuntimeError: If no streaming session is active.
        Exception: Any other error is logged, clears ``is_streaming`` and
            is re-raised.
    """
    if not self.is_streaming:
        raise RuntimeError("Streaming not started. Call start_streaming() first.")

    try:
        # Hand the audio to the background thread and update the counters.
        self.audio_queue.put(audio_chunk)
        self.total_chunks = self.total_chunks + 1
        self.total_audio_bytes = self.total_audio_bytes + len(audio_chunk)

        # Progress log every 50th chunk
        if not self.total_chunks % 50:
            log_debug(f"πŸ“Š Processing... {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")

        # Drain every result that is available right now.
        drained = False
        while not drained:
            try:
                result = self.results_queue.get_nowait()

                # Log for debugging
                log_debug(f"🎯 Yielding result: is_final={result.is_final}, text='{result.text}'")

                yield result

                # A final result means the stream is over.
                if result.is_final:
                    self.is_streaming = False

            except queue.Empty:
                drained = True

    except Exception as e:
        log_error(f"❌ Error streaming audio", error=str(e))
        self.is_streaming = False
        raise
283
+
284
async def stop_streaming(self) -> Optional[TranscriptionResult]:
    """Stop the current session and return the last final transcript, if any.

    Signals the background thread to stop, waits up to 2 seconds for it,
    then drains ``results_queue``.

    Returns:
        The last queued result that is final and has non-empty text, or
        None when there is none or when an error occurred while stopping.
    """
    worker = self.stream_thread
    worker_alive = worker is not None and worker.is_alive()

    # `is_streaming` alone is not enough: it is cleared when a final result
    # is yielded and when the worker exits, both of which can happen while
    # the worker is still alive or results are still queued. Returning early
    # in those cases would leak the thread or lose the final transcript.
    if not self.is_streaming and not worker_alive and self.results_queue.empty():
        log_debug("Already stopped, nothing to do")
        return None

    try:
        log_info(f"πŸ›‘ Stopping Google STT session #{self.session_id}")

        self.is_streaming = False

        # Signal stop
        self.stop_event.set()

        # Send sentinel to audio queue
        self.audio_queue.put(None)

        # Wait for thread to finish
        if worker_alive:
            worker.join(timeout=2.0)

        # Keep the last final, non-empty result that is still queued.
        final_result = None
        while True:
            try:
                result = self.results_queue.get_nowait()
            except queue.Empty:
                break
            if result.is_final and result.text:
                final_result = result

        log_info(f"βœ… Google STT session #{self.session_id} stopped")
        return final_result

    except Exception as e:
        log_error(f"❌ Error during stop_streaming", error=str(e))
        self.is_streaming = False
        return None
322
+
323
def _reset_session_data(self):
    """Empty both queues, zero the counters and advance ``session_id``."""
    # Drain leftovers from the previous session. Only queue.Empty is
    # expected here (another thread may take the last item between the
    # check and the get), so nothing else is swallowed.
    for pending in (self.audio_queue, self.results_queue):
        while True:
            try:
                pending.get_nowait()
            except queue.Empty:
                break

    # Reset counters
    self.total_audio_bytes = 0
    self.total_chunks = 0
    self.session_id += 1

    log_debug(f"πŸ”„ Session data reset. New session ID: {self.session_id}")
344
+
345
def supports_realtime(self) -> bool:
    """Report that this provider can transcribe a live audio stream."""
    realtime_capable = True
    return realtime_capable
348
+
349
def get_supported_languages(self) -> List[str]:
    """Return the language codes this provider advertises.

    A fresh list is returned on every call.
    """
    # Partial list of Google Cloud Speech-to-Text languages.
    # Full list: https://cloud.google.com/speech-to-text/docs/languages
    advertised = (
        "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
        "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
        "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
        "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
        "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
        "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH",
    )
    return list(advertised)
361
+
362
def get_provider_name(self) -> str:
    """Return the identifier of this STT provider."""
    provider = "google"
    return provider