ciyidogan committed on
Commit
89d0af3
·
verified ·
1 Parent(s): c4954b5

Update stt/stt_google.py

Browse files
Files changed (1) hide show
  1. stt/stt_google.py +85 -85
stt/stt_google.py CHANGED
@@ -177,96 +177,96 @@ class GoogleSTT(STTInterface):
177
  log_error(f"❌ Silence trimming failed: {e}")
178
  return audio_data
179
 
180
- async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
181
- """Transcribe audio data using Google Cloud Speech API"""
182
- try:
183
- # Check if we have audio to transcribe
184
- if not audio_data:
185
- log_warning("⚠️ No audio data provided")
186
- return None
187
-
188
- log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
189
-
190
- # ✅ Audio analizi
191
- self._analyze_audio_content(audio_data)
192
-
193
- # ✅ Silence trimming ekle
194
- trimmed_audio = self._trim_silence(audio_data)
195
-
196
- if len(trimmed_audio) < 8000: # 0.5 saniyeden az
197
- log_warning("⚠️ Audio too short after trimming")
198
- return None
199
-
200
- # ✅ Test kodundan EXACT aynı format - wave modülü kullan
201
- wav_audio = self._create_wav_like_test(trimmed_audio, config.sample_rate)
202
- log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
203
-
204
- # Configure recognition - TEST KODUNDAN EXACT AYNI
205
- recognition_config = RecognitionConfig(
206
- encoding=RecognitionConfig.AudioEncoding.LINEAR16,
207
- sample_rate_hertz=config.sample_rate,
208
- language_code="tr-TR", # Hardcode tr-TR like test
209
- audio_channel_count=1,
210
- enable_separate_recognition_per_channel=False,
211
- )
212
 
213
- log_debug(f"Recognition config: language=tr-TR, sample_rate={config.sample_rate}")
214
-
215
- # ✅ Create audio object with WAV data
216
- audio = RecognitionAudio(content=wav_audio)
217
-
218
- # Perform synchronous recognition
219
- log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
220
- response = self.client.recognize(config=recognition_config, audio=audio)
221
-
222
- # ✅ Detaylı response analizi
223
- log_debug(f"API Response: {response}")
224
- log_info(f"🔍 Google response details:")
225
- log_info(f"- Has results: {bool(response.results)}")
226
- log_info(f"- Results count: {len(response.results)}")
227
-
228
- # ✅ Request ID'yi logla
229
- if hasattr(response, '_pb') and hasattr(response._pb, 'request_id'):
230
- log_info(f"- Request ID: {response._pb.request_id}")
231
-
232
- if hasattr(response, 'total_billed_time'):
233
- billed_seconds = response.total_billed_time.total_seconds()
234
- log_info(f"- Billed time: {billed_seconds}s")
235
 
236
- # ✅ Eğer billed time 0 ise, Google hiç audio işlememiş demektir
237
- if billed_seconds == 0:
238
- log_error("❌ Google didn't process any audio - possible format issue")
 
 
239
  return None
240
- else:
241
- log_info(f"- Billed time: 0s (no audio processed)")
242
-
243
- # Process results
244
- if response.results:
245
- for i, result in enumerate(response.results):
246
- log_debug(f"Result {i}: {result}")
 
 
 
 
 
 
247
 
248
- if result.alternatives:
249
- alternative = result.alternatives[0]
250
-
251
- transcription = TranscriptionResult(
252
- text=alternative.transcript,
253
- confidence=alternative.confidence,
254
- timestamp=datetime.now().timestamp(),
255
- language="tr-TR",
256
- word_timestamps=None
257
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
260
- return transcription
261
-
262
- log_warning("⚠️ No transcription results - Google couldn't recognize speech")
263
- return None
264
-
265
- except Exception as e:
266
- log_error(f"❌ Error during transcription: {str(e)}")
267
- import traceback
268
- log_error(f"Traceback: {traceback.format_exc()}")
269
- return None
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
272
  """Create WAV exactly like test code using wave module"""
 
177
  log_error(f"❌ Silence trimming failed: {e}")
178
  return audio_data
179
 
180
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
    """Transcribe one audio buffer with the Google Cloud Speech API.

    The buffer is analysed, silence-trimmed, wrapped in a WAV container and
    sent to ``self.client.recognize``. The blocking recognize call is run in
    the event loop's default executor so that other coroutines keep running
    while the request is in flight.

    Args:
        audio_data: Raw audio bytes. The request is declared as LINEAR16,
            single channel, so the data is treated as 16-bit mono PCM.
        config: STT settings; only ``config.sample_rate`` is read here.

    Returns:
        A ``TranscriptionResult`` built from the first alternative of the
        first result that has alternatives, or ``None`` when there is no
        input, the trimmed audio is shorter than 8000 bytes, Google reports
        zero billed time, no result carries an alternative, or an
        ``Exception`` is raised (it is logged together with its traceback).
    """
    # Local imports keep this method self-contained; the file already
    # imports inside functions (see ``traceback`` below).
    import asyncio
    import functools

    try:
        # Nothing to do without audio
        if not audio_data:
            log_warning("⚠️ No audio data provided")
            return None

        log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

        # Audio analysis
        self._analyze_audio_content(audio_data)

        # Trim silence before sending
        trimmed_audio = self._trim_silence(audio_data)

        # Fixed 8000-byte floor. For 16-bit mono PCM that is 0.5 s at 8 kHz
        # but only 0.25 s at 16 kHz; the threshold does not scale with
        # config.sample_rate.
        if len(trimmed_audio) < 8000:
            log_warning("⚠️ Audio too short after trimming")
            return None

        # Same format as the test code: build the WAV via the wave-based helper
        wav_audio = self._create_wav_like_test(trimmed_audio, config.sample_rate)
        log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")

        # Recognition settings, kept identical to the test code.
        # NOTE(review): the language is hard-coded to tr-TR rather than
        # taken from ``config`` - confirm this is still intended.
        recognition_config = RecognitionConfig(
            encoding=RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=config.sample_rate,
            language_code="tr-TR",  # Hardcode tr-TR like test
            audio_channel_count=1,
            enable_separate_recognition_per_channel=False,
        )

        log_debug(f"Recognition config: language=tr-TR, sample_rate={config.sample_rate}")

        # Create audio object with WAV data
        audio = RecognitionAudio(content=wav_audio)

        # ``recognize`` is a synchronous call; run it in the default executor
        # instead of blocking the event loop for the whole round trip.
        log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(
                self.client.recognize,
                config=recognition_config,
                audio=audio,
            ),
        )

        # Detailed response diagnostics
        log_debug(f"API Response: {response}")
        log_info(f"🔍 Google response details:")
        log_info(f"- Has results: {bool(response.results)}")
        log_info(f"- Results count: {len(response.results)}")

        # Log the request ID when the underlying message exposes one
        if hasattr(response, '_pb') and hasattr(response._pb, 'request_id'):
            log_info(f"- Request ID: {response._pb.request_id}")

        if hasattr(response, 'total_billed_time'):
            billed_seconds = response.total_billed_time.total_seconds()
            log_info(f"- Billed time: {billed_seconds}s")

            # Zero billed time means Google did not process any audio at all
            if billed_seconds == 0:
                log_error("❌ Google didn't process any audio - possible format issue")
                return None
        else:
            log_info(f"- Billed time: 0s (no audio processed)")

        # Process results.
        # NOTE(review): only the first result with alternatives is returned;
        # any later results in the response are ignored - confirm intended.
        if response.results:
            for i, result in enumerate(response.results):
                log_debug(f"Result {i}: {result}")

                if result.alternatives:
                    alternative = result.alternatives[0]

                    transcription = TranscriptionResult(
                        text=alternative.transcript,
                        confidence=alternative.confidence,
                        timestamp=datetime.now().timestamp(),
                        language="tr-TR",
                        word_timestamps=None
                    )

                    log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
                    return transcription

        log_warning("⚠️ No transcription results - Google couldn't recognize speech")
        return None

    except Exception as e:
        # Best-effort contract: log the failure and report "no result"
        log_error(f"❌ Error during transcription: {str(e)}")
        import traceback
        log_error(f"Traceback: {traceback.format_exc()}")
        return None
270
 
271
  def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
272
  """Create WAV exactly like test code using wave module"""