ciyidogan committed
Commit a532986 · verified · 1 Parent(s): 6aeaf3c

Update stt/stt_google.py

Files changed (1):
  1. stt/stt_google.py +69 -334
stt/stt_google.py CHANGED
@@ -1,19 +1,16 @@
 """
-Google Cloud Speech-to-Text Implementation
+Google Cloud Speech-to-Text Implementation - Simple Batch Mode
 """
-import asyncio
-from typing import AsyncIterator, Optional, List, Any
+from typing import Optional, List
 from datetime import datetime
-import queue
-import threading
-import traceback
-import os
+import io
+import wave
 from google.cloud import speech
-from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
-import google.auth
+from google.cloud.speech import RecognitionConfig, RecognitionAudio
 from utils.logger import log_info, log_error, log_debug, log_warning
 from .stt_interface import STTInterface, STTConfig, TranscriptionResult
 
+
 class GoogleSTT(STTInterface):
     def __init__(self, credentials_path: Optional[str] = None):
         """
@@ -21,16 +18,7 @@ class GoogleSTT(STTInterface):
         Args:
             credentials_path: Path to service account JSON file (optional if using default credentials)
         """
-        try:
-            # ✅ Path check for debugging
-            if credentials_path:
-                import os
-                if not os.path.exists(credentials_path):
-                    log_error(f"❌ Credentials file not found at: {credentials_path}")
-                    raise FileNotFoundError(f"Credentials file not found: {credentials_path}")
-
-            log_info(f"📁 Using credentials from: {credentials_path}")
-
+        try:
             # Initialize client
             if credentials_path:
                 self.client = speech.SpeechClient.from_service_account_file(credentials_path)
@@ -40,22 +28,6 @@ class GoogleSTT(STTInterface):
                 self.client = speech.SpeechClient()
                 log_info("✅ Google STT initialized with default credentials")
 
-            # Streaming state
-            self.is_streaming = False
-            self.audio_generator = None
-            self.responses_stream = None
-            self.audio_queue = queue.Queue()
-            self.results_queue = queue.Queue(maxsize=100)
-
-            # Session tracking
-            self.session_id = 0
-            self.total_audio_bytes = 0
-            self.total_chunks = 0
-
-            # Threading
-            self.stream_thread = None
-            self.stop_event = threading.Event()
-
         except Exception as e:
             log_error(f"❌ Failed to initialize Google STT: {str(e)}")
             raise
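For orientation before the main hunk below: construction is unaffected by this commit. A minimal usage sketch, assuming a hypothetical service-account path; with no path, SpeechClient() falls back to Application Default Credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS):

    from stt.stt_google import GoogleSTT

    # Explicit service-account JSON (hypothetical path, for illustration only):
    stt = GoogleSTT(credentials_path="creds/google-stt.json")

    # Or rely on Application Default Credentials:
    stt = GoogleSTT()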
@@ -80,333 +52,96 @@ class GoogleSTT(STTInterface):
         }
         return language_map.get(language, language)
 
-    async def start_streaming(self, config: STTConfig) -> None:
-        """Initialize streaming session"""
+    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
+        """Transcribe audio data using Google Cloud Speech API"""
         try:
-            # Stop any existing stream
-            if self.is_streaming:
-                log_warning("⚠️ Previous stream still active, stopping it first")
-                await self.stop_streaming()
-                await asyncio.sleep(0.5)
+            # Check if we have audio to transcribe
+            if not audio_data:
+                log_warning("⚠️ No audio data provided")
+                return None
 
-            # Reset session data
-            self._reset_session_data()
+            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
-            log_info(f"🎤 Starting Google STT - Session #{self.session_id}")
+            # Convert to WAV format for better compatibility
+            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
 
-            # Configure recognition settings
+            # Configure recognition
             language_code = self._map_language_code(config.language)
-
-            """
-            # ✅ Google STT best practices for Turkish and single utterance
-            recognition_config = RecognitionConfig(
-                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=16000,
-                language_code="tr-TR",
-                # ✅ Ideal settings for a single utterance
-                enable_automatic_punctuation=True,
-                # Model selection - latest_long for better accuracy
-                model="latest_long",
-                # Use enhanced model if available (better for Turkish)
-                use_enhanced=True,
-                # Single channel audio
-                audio_channel_count=1,
-                # Alternative transcripts for debugging
-                max_alternatives=1,
-                # Profanity filter disabled for accuracy
-                profanity_filter=False,
-                # Word level confidence
-                enable_word_confidence=False,
-                enable_spoken_punctuation=False,
-                enable_spoken_emojis=False,
-            )
-
-            # ✅ Streaming config - optimized for final results only
-            self.streaming_config = StreamingRecognitionConfig(
-                config=recognition_config,
-                single_utterance=False,
-                interim_results=True
-            )
-            """
-
-            # ✅ SIMPLEST CONFIG - required fields only
+
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=16000,
-                language_code="tr-TR"
+                sample_rate_hertz=config.sample_rate,
+                language_code=language_code,
+                enable_automatic_punctuation=config.enable_punctuation,
+                model=config.model,
+                use_enhanced=config.use_enhanced,
+                enable_word_time_offsets=config.enable_word_timestamps,
             )
 
-            # Streaming config - simplest form
-            self.streaming_config = StreamingRecognitionConfig(
-                config=recognition_config,
-                interim_results=True
-            )
-
-            log_info(f"🔧 Google STT config: language={language_code}, "
-                     f"model=latest_long, enhanced=True, "
-                     f"single_utterance=True, interim_results=False")
-
-            # Start streaming in background thread
-            self.stop_event.clear()
-            self.stream_thread = threading.Thread(
-                target=self._stream_recognition,
-                daemon=True
-            )
-            self.stream_thread.start()
-
-            self.is_streaming = True
-            log_info(f"✅ Google STT started - Ready for speech")
-
-        except Exception as e:
-            log_error(f"❌ Failed to start Google STT", error=str(e))
-            self.is_streaming = False
-            raise
-
-    def _stream_recognition(self):
-        """Background thread for streaming recognition"""
-        try:
-            log_debug("🎙️ Starting recognition stream thread")
-
-            # ✅ Log the config
-            log_debug(f"Config details: {self.streaming_config}")
-
-            # Create audio generator
-            audio_generator = self._audio_generator()
-
-            # ✅ More detailed error handling
-            try:
-                # Start streaming recognition
-                responses = self.client.streaming_recognize(
-                    self.streaming_config,
-                    audio_generator
-                )
-            except Exception as api_error:
-                log_error(f"❌ Google API error: {str(api_error)}")
-                log_error(f"❌ Error type: {type(api_error).__name__}")
-                if hasattr(api_error, 'details'):
-                    log_error(f"❌ Error details: {api_error.details()}")
-                if hasattr(api_error, '__dict__'):
-                    log_error(f"❌ Error attributes: {api_error.__dict__}")
-                import traceback
-                log_error(f"❌ Full traceback: {traceback.format_exc()}")
-                raise
-
-            # Process responses
-            for response in responses:
-                if self.stop_event.is_set():
-                    break
-
-                if not response.results:
-                    continue
-
-                # Process each result
-                for result in response.results:
-                    if not result.alternatives:
-                        continue
-
-                    # Get best alternative
+            # Create audio object
+            audio = RecognitionAudio(content=wav_audio)
+
+            # Perform synchronous recognition
+            log_info(f"🔄 Sending audio to Google Cloud Speech API...")
+            response = self.client.recognize(config=recognition_config, audio=audio)
+
+            # Process results
+            if response.results:
+                result = response.results[0]
+                if result.alternatives:
                     alternative = result.alternatives[0]
 
-                    # Only process if we have transcript
-                    if alternative.transcript:
-                        # Log interim results but don't enqueue them
-                        if not result.is_final:
-                            log_debug(f"📝 Interim transcript (ignored): '{alternative.transcript}'")
-                            continue
-
-                        # ✅ Only process final results
-                        transcription_result = TranscriptionResult(
-                            text=alternative.transcript,
-                            is_final=result.is_final,
-                            confidence=alternative.confidence,
-                            timestamp=datetime.now().timestamp()
-                        )
-
-                        try:
-                            self.results_queue.put(transcription_result)
-
-                            if result.is_final:
-                                log_info(f"🎯 FINAL TRANSCRIPT: '{alternative.transcript}' "
-                                         f"(confidence: {alternative.confidence:.2f})")
-                                # Single utterance mode will end stream after this
-                                break
-                            else:
-                                # This shouldn't happen with interim_results=False
-                                log_debug(f"📝 Transcript: '{alternative.transcript}'")
-
-                        except queue.Full:
-                            log_warning("⚠️ Results queue full")
-
-                # Check if stream ended due to single_utterance
-                if hasattr(response, 'speech_event_type'):
-                    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
-                        log_info("🔚 End of single utterance detected")
-                        break
-
-        except Exception as e:
-            if not self.stop_event.is_set():
-                log_error(f"❌ Recognition stream error: {str(e)}")
-                # Put error in queue
-                error_result = TranscriptionResult(
-                    text="",
-                    is_final=True,
-                    confidence=0.0,
-                    timestamp=datetime.now().timestamp()
-                )
-                self.results_queue.put(error_result)
-        finally:
-            log_debug("🎙️ Recognition stream thread ended")
-            self.is_streaming = False
-
-    def _audio_generator(self):
-        """Generator that yields audio chunks for streaming"""
-        chunk_count = 0
-        try:
-            while not self.stop_event.is_set():
-                try:
-                    # Get audio chunk with timeout
-                    chunk = self.audio_queue.get(timeout=0.1)
-
-                    if chunk is None:  # Sentinel value
-                        log_debug("🔚 Audio generator received sentinel, stopping")
-                        break
-
-                    # ✅ Chunk info for debugging
-                    chunk_count += 1
-                    if chunk_count <= 5:  # Detailed log for the first 5 chunks
-                        log_debug(f"🎵 Audio generator yielding chunk #{chunk_count}, size: {len(chunk)} bytes")
-                    # Make sure the chunk is of type bytes
-                    if not isinstance(chunk, bytes):
-                        log_error(f"❌ Chunk is not bytes! Type: {type(chunk)}")
-                        continue
-
-                    # The format the Google API expects
-                    yield chunk
-
-                except queue.Empty:
-                    continue
-                except Exception as e:
-                    log_error(f"❌ Audio generator error: {str(e)}")
-                    break
-        finally:
-            log_debug(f"🎙️ Audio generator stopped after {chunk_count} chunks")
-
-    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
-        """Stream audio chunk and get transcription results"""
-        if not self.is_streaming:
-            raise RuntimeError("Streaming not started. Call start_streaming() first.")
-
-        try:
-            # ✅ Check the audio chunk type
-            if not isinstance(audio_chunk, bytes):
-                log_error(f"❌ Audio chunk is not bytes! Type: {type(audio_chunk)}")
-                raise TypeError(f"Expected bytes, got {type(audio_chunk)}")
-
-            # Log the chunk size
-            if self.total_chunks < 5:
-                log_debug(f"📦 Adding audio chunk #{self.total_chunks} to queue, size: {len(audio_chunk)} bytes")
-
-            # Add audio to queue for background thread
-            self.audio_queue.put(audio_chunk)
-
-            self.total_chunks += 1
-            self.total_audio_bytes += len(audio_chunk)
-
-            # Log progress
-            if self.total_chunks % 50 == 0:
-                log_debug(f"📊 Processing... {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")
-
-            # Check for results
-            while True:
-                try:
-                    result = self.results_queue.get_nowait()
-
-                    # Log for debugging
-                    log_debug(f"🎯 Yielding result: is_final={result.is_final}, text='{result.text}'")
-
-                    yield result
-
-                    # If final result, stream will end
-                    if result.is_final:
-                        self.is_streaming = False
-
-                except queue.Empty:
-                    break
-
-        except Exception as e:
-            log_error(f"❌ Error streaming audio", error=str(e))
-            self.is_streaming = False
-            raise
-
-    async def stop_streaming(self) -> Optional[TranscriptionResult]:
-        """Stop streaming and clean up"""
-        if not self.is_streaming:
-            log_debug("Already stopped, nothing to do")
+                    # Extract word timestamps if available
+                    word_timestamps = None
+                    if config.enable_word_timestamps and hasattr(alternative, 'words'):
+                        word_timestamps = [
+                            {
+                                "word": word_info.word,
+                                "start_time": word_info.start_time.total_seconds(),
+                                "end_time": word_info.end_time.total_seconds()
+                            }
+                            for word_info in alternative.words
+                        ]
+
+                    transcription = TranscriptionResult(
+                        text=alternative.transcript,
+                        confidence=alternative.confidence,
+                        timestamp=datetime.now().timestamp(),
+                        language=language_code,
+                        word_timestamps=word_timestamps
+                    )
+
+                    log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
+                    return transcription
+
+            log_warning("⚠️ No transcription results")
             return None
 
-        try:
-            log_info(f"🛑 Stopping Google STT session #{self.session_id}")
-
-            self.is_streaming = False
-
-            # Signal stop
-            self.stop_event.set()
-
-            # Send sentinel to audio queue
-            self.audio_queue.put(None)
-
-            # Wait for thread to finish
-            if self.stream_thread and self.stream_thread.is_alive():
-                self.stream_thread.join(timeout=2.0)
-
-            # Get final result if any
-            final_result = None
-            while not self.results_queue.empty():
-                try:
-                    result = self.results_queue.get_nowait()
-                    if result.is_final and result.text:
-                        final_result = result
-                except queue.Empty:
-                    break
-
-            log_info(f"✅ Google STT session #{self.session_id} stopped")
-            return final_result
-
         except Exception as e:
-            log_error(f"❌ Error during stop_streaming", error=str(e))
-            self.is_streaming = False
+            log_error(f"❌ Error during transcription: {str(e)}")
+            import traceback
+            log_error(f"Traceback: {traceback.format_exc()}")
             return None
 
-    def _reset_session_data(self):
-        """Reset session-specific data"""
-        # Clear queues
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-            except:
-                pass
-
-        while not self.results_queue.empty():
-            try:
-                self.results_queue.get_nowait()
-            except:
-                pass
-
-        # Reset counters
-        self.total_audio_bytes = 0
-        self.total_chunks = 0
-        self.session_id += 1
-
-        log_debug(f"🔄 Session data reset. New session ID: {self.session_id}")
-
-    def supports_realtime(self) -> bool:
-        """Google STT supports real-time streaming"""
-        return True
+    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
+        """Convert raw PCM audio to WAV format"""
+        # Create WAV file in memory
+        wav_buffer = io.BytesIO()
+
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            # Set WAV parameters
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(audio_data)
+
+        # Get WAV data
+        wav_buffer.seek(0)
+        return wav_buffer.read()
 
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
         # Google Cloud Speech-to-Text supported languages (partial list)
-        # Full list: https://cloud.google.com/speech-to-text/docs/languages
         return [
             "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
             "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",