ciyidogan committed (verified)
Commit 6abf273 · 1 Parent(s): acaa42c

Update stt/stt_google.py

Files changed (1):
  1. stt/stt_google.py +91 -57
stt/stt_google.py CHANGED
@@ -67,98 +67,132 @@ class GoogleSTT(STTInterface):
         return language_map.get(language, language)
 
     async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
-        """Transcribe audio data using Google Cloud Speech API"""
         try:
-            # Check if we have audio to transcribe
             if not audio_data:
                 log_warning("⚠️ No audio data provided")
                 return None
 
             log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
-            # ✅ Debug - audio data analysis
-            if len(audio_data) > 100:
-                # Check the first and last 50 bytes
-                first_50 = audio_data[:50]
-                last_50 = audio_data[-50:]
-                log_debug(f"Audio first 50 bytes: {first_50.hex()}")
-                log_debug(f"Audio last 50 bytes: {last_50.hex()}")
-
-                # Average amplitude check
-                import struct
-                samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
-                avg_amplitude = sum(abs(s) for s in samples) / len(samples)
-                max_amplitude = max(abs(s) for s in samples)
-                log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
-
-            # Convert to WAV format for better compatibility
+            # ✅ Detailed audio analysis, written to the log
+            import struct
+            samples = struct.unpack(f'{len(audio_data) // 2}h', audio_data[:len(audio_data) // 2 * 2])  # drop a trailing odd byte
+            total_samples = len(samples)
+
+            # 1. Overall statistics
+            non_zero_samples = [s for s in samples if s != 0]
+            zero_count = total_samples - len(non_zero_samples)
+
+            if non_zero_samples:
+                avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
+                max_amplitude = max(abs(s) for s in non_zero_samples)
+            else:
+                avg_amplitude = 0
+                max_amplitude = 0
+
+            log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
+            log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")
+
+            # 2. Section-by-section analysis (split into 10 sections)
+            section_size = max(1, total_samples // 10)  # avoid zero-length sections on very short audio
+            log_info(f"🔍 Section analysis (each {section_size} samples):")
+
+            for i in range(10):
+                start_idx = i * section_size
+                end_idx = (i + 1) * section_size if i < 9 else total_samples
+                section = samples[start_idx:end_idx]
+
+                section_non_zero = [s for s in section if s != 0]
+                section_max = max(abs(s) for s in section_non_zero) if section_non_zero else 0
+                section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero) if section_non_zero else 0
+                zero_ratio = (len(section) - len(section_non_zero)) / len(section) if section else 0.0
+
+                log_info(f"   Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={zero_ratio:.1%}")
+
+            # 3. Find where speech first starts
+            speech_threshold = 500  # RMS threshold
+            speech_start_idx = -1
+
+            # Compute RMS over 100-sample windows
+            window_size = 100
+            for i in range(0, total_samples - window_size, window_size):
+                window = samples[i:i + window_size]
+                rms = (sum(s * s for s in window) / window_size) ** 0.5
+
+                if rms > speech_threshold:
+                    speech_start_idx = i
+                    break
+
+            if speech_start_idx >= 0:
+                speech_start_time = speech_start_idx / config.sample_rate
+                log_info(f"🎤 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
+            else:
+                log_warning("⚠️ No speech detected above threshold in entire audio")
+
+            # 4. Check whether the audio is effectively empty
+            if max_amplitude < 100:
+                log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
+                return None
+
+            if zero_count / total_samples > 0.95:  # more than 95% zeros
+                log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
+                return None
+
+            # Convert to WAV format
             wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
 
             # Configure recognition
-            language_code = self._map_language_code(config.language)
-
-            """
-            recognition_config = RecognitionConfig(
-                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=config.sample_rate,
-                language_code=language_code,
-                enable_automatic_punctuation=config.enable_punctuation,
-                model=config.model,
-                use_enhanced=config.use_enhanced,
-                enable_word_time_offsets=config.enable_word_timestamps,
-            )
-            """
-
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                 sample_rate_hertz=16000,
                 language_code="tr-TR",
-                audio_channel_count=1,  # the frontend sends mono audio
+                audio_channel_count=1,
                 enable_separate_recognition_per_channel=False,
+                enable_automatic_punctuation=True,
             )
-
-            log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
-
+
             # Create audio object
             audio = RecognitionAudio(content=wav_audio)
 
             # Perform synchronous recognition
             log_info(f"🔄 Sending audio to Google Cloud Speech API...")
             response = self.client.recognize(config=recognition_config, audio=audio)
-
-            # ✅ Debug response
-            log_debug(f"API Response: {response}")
-
+
+            # ✅ Detailed response analysis
+            log_info(f"🔍 Google response details:")
+            log_info(f"   - Has results: {bool(response.results)}")
+            log_info(f"   - Results count: {len(response.results) if response.results else 0}")
+
+            if hasattr(response, 'total_billed_time'):
+                if response.total_billed_time and response.total_billed_time.total_seconds() > 0:
+                    log_info(f"   - Billed time: {response.total_billed_time.total_seconds()}s")
+                else:
+                    log_info(f"   - Billed time: 0s (no audio processed)")
+
             # Process results
-            if response.results:
+            if response.results and len(response.results) > 0:
+                for i, result in enumerate(response.results):
+                    log_info(f"   - Result {i}: {len(result.alternatives)} alternatives")
+                    if result.alternatives:
+                        for j, alt in enumerate(result.alternatives):
+                            log_info(f"     - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")
+
                 result = response.results[0]
-                if result.alternatives:
+                if result.alternatives and len(result.alternatives) > 0:
                     alternative = result.alternatives[0]
 
-                    # Extract word timestamps if available
-                    word_timestamps = None
-                    if config.enable_word_timestamps and hasattr(alternative, 'words'):
-                        word_timestamps = [
-                            {
-                                "word": word_info.word,
-                                "start_time": word_info.start_time.total_seconds(),
-                                "end_time": word_info.end_time.total_seconds()
-                            }
-                            for word_info in alternative.words
-                        ]
-
                     transcription = TranscriptionResult(
                         text=alternative.transcript,
                         confidence=alternative.confidence,
                         timestamp=datetime.now().timestamp(),
-                        language=language_code,
-                        word_timestamps=word_timestamps
+                        language="tr-TR",
+                        word_timestamps=None
                     )
 
-                    log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
+                    log_info(f"✅ Transcription SUCCESS: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
                     return transcription
 
-            log_warning("⚠️ No transcription results")
+            log_warning("⚠️ No transcription results - Google couldn't recognize speech")
             return None
 
         except Exception as e:
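
The windowed-RMS check added in this commit is easy to tune offline against a saved PCM capture. Below is a minimal standalone sketch of the same logic, assuming 16-bit little-endian mono PCM; the function name, default values, and the capture.pcm path are illustrative, not part of the committed file.

    import struct
    from typing import Optional

    def find_speech_start(audio_data: bytes, sample_rate: int = 16000,
                          window_size: int = 100, threshold: float = 500.0) -> Optional[float]:
        """Return the time in seconds of the first window whose RMS exceeds
        the threshold, or None if the audio never crosses it."""
        usable = len(audio_data) // 2 * 2  # drop a trailing odd byte
        samples = struct.unpack(f'<{usable // 2}h', audio_data[:usable])
        # Slide non-overlapping windows across the samples, as in the diff above
        for i in range(0, len(samples) - window_size, window_size):
            window = samples[i:i + window_size]
            rms = (sum(s * s for s in window) / window_size) ** 0.5
            if rms > threshold:
                return i / sample_rate
        return None

    if __name__ == '__main__':
        with open('capture.pcm', 'rb') as f:  # hypothetical raw PCM dump
            start = find_speech_start(f.read())
        print(f'speech starts at {start:.2f}s' if start is not None else 'no speech found')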
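
The hunk calls self._convert_to_wav() but the helper itself is outside the hunk. Assuming it wraps the same raw LINEAR16 PCM in a WAV container, a stdlib-only sketch of such a helper could look like this (the standalone name convert_to_wav is hypothetical):

    import io
    import wave

    def convert_to_wav(pcm_data: bytes, sample_rate: int) -> bytes:
        # Wrap headerless 16-bit mono PCM in a WAV container
        buf = io.BytesIO()
        with wave.open(buf, 'wb') as wav_file:
            wav_file.setnchannels(1)   # mono, matching audio_channel_count=1
            wav_file.setsampwidth(2)   # 2 bytes per sample = LINEAR16
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)
        return buf.getvalue()

One caveat worth knowing: for WAV input, Google reads the sample rate and channel count from the file header, so a sample_rate_hertz or audio_channel_count in RecognitionConfig that disagrees with the header can cause the request to be rejected.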
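
Finally, the committed code hard-codes sample_rate_hertz=16000, language_code="tr-TR", and language="tr-TR" in TranscriptionResult, presumably to pin things down while debugging; the deleted commented-out block shows the config-driven shape the method had before. A sketch of restoring it, as a drop-in for the RecognitionConfig block inside transcribe() (config.sample_rate, config.language, and config.enable_punctuation all appear in the hunk; treat the rest as assumptions):

    recognition_config = RecognitionConfig(
        encoding=RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=config.sample_rate,
        language_code=self._map_language_code(config.language),
        audio_channel_count=1,
        enable_separate_recognition_per_channel=False,
        enable_automatic_punctuation=config.enable_punctuation,
    )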