ciyidogan committed on
Commit
9c58077
·
verified ·
1 Parent(s): 308dbba

Update stt/stt_google.py

Browse files
Files changed (1) hide show
  1. stt/stt_google.py +170 -83
stt/stt_google.py CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, List
5
  from datetime import datetime
6
  import io
7
  import wave
 
8
  from google.cloud import speech
9
  from google.cloud.speech import RecognitionConfig, RecognitionAudio
10
  from utils.logger import log_info, log_error, log_debug, log_warning
@@ -65,7 +66,66 @@ class GoogleSTT(STTInterface):
65
 
66
  # Default to the language itself if not in map
67
  return language_map.get(language, language)
68
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
70
  """Transcribe audio data using Google Cloud Speech API"""
71
  try:
@@ -76,85 +136,99 @@ class GoogleSTT(STTInterface):
76
 
77
  log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
78
 
79
- # ✅ Debug - audio verisi analizi
80
- if len(audio_data) > 100:
81
- # İlk ve son 50 byte'ı kontrol et
82
- first_50 = audio_data[:50]
83
- last_50 = audio_data[-50:]
84
- log_debug(f"Audio first 50 bytes: {first_50.hex()}")
85
- log_debug(f"Audio last 50 bytes: {last_50.hex()}")
86
-
87
- # Ortalama amplitude kontrolü
88
- import struct
89
- samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
90
- avg_amplitude = sum(abs(s) for s in samples) / len(samples)
91
- max_amplitude = max(abs(s) for s in samples)
92
- log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
93
-
94
- # ✅ Convert to WAV format for better compatibility
95
- wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
96
  log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
97
 
98
  # Configure recognition
99
  language_code = self._map_language_code(config.language)
100
 
101
- # ✅ WAV audio kullanıyoruz artık
102
  recognition_config = RecognitionConfig(
103
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
104
- sample_rate_hertz=16000,
105
- language_code="tr-TR",
106
- audio_channel_count=1, # Frontend mono audio gönderiyor
107
  enable_separate_recognition_per_channel=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  )
109
 
110
- log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
111
 
112
- # ✅ Create audio object with WAV data (not raw PCM)
113
- audio = RecognitionAudio(content=wav_audio) # wav_audio kullan, audio_data değil
114
 
115
  # Perform synchronous recognition
116
  log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
117
  response = self.client.recognize(config=recognition_config, audio=audio)
118
 
119
- # ✅ Debug response
120
  log_debug(f"API Response: {response}")
121
  log_info(f"🔍 Google response details:")
122
  log_info(f"- Has results: {bool(response.results)}")
123
  log_info(f"- Results count: {len(response.results)}")
124
 
 
 
 
 
125
  if hasattr(response, 'total_billed_time'):
126
- log_info(f"- Billed time: {response.total_billed_time.total_seconds()}s")
 
 
 
 
 
 
127
  else:
128
  log_info(f"- Billed time: 0s (no audio processed)")
129
 
130
  # Process results
131
  if response.results:
132
- result = response.results[0]
133
- if result.alternatives:
134
- alternative = result.alternatives[0]
135
-
136
- # Extract word timestamps if available
137
- word_timestamps = None
138
- if config.enable_word_timestamps and hasattr(alternative, 'words'):
139
- word_timestamps = [
140
- {
141
- "word": word_info.word,
142
- "start_time": word_info.start_time.total_seconds(),
143
- "end_time": word_info.end_time.total_seconds()
144
- }
145
- for word_info in alternative.words
146
- ]
147
 
148
- transcription = TranscriptionResult(
149
- text=alternative.transcript,
150
- confidence=alternative.confidence,
151
- timestamp=datetime.now().timestamp(),
152
- language=language_code,
153
- word_timestamps=word_timestamps
154
- )
155
-
156
- log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
157
- return transcription
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  log_warning("⚠️ No transcription results - Google couldn't recognize speech")
160
  return None
@@ -165,44 +239,57 @@ class GoogleSTT(STTInterface):
165
  log_error(f"Traceback: {traceback.format_exc()}")
166
  return None
167
 
168
- def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
169
- """Convert raw PCM to proper WAV format"""
170
  try:
171
- import struct
172
-
173
- # WAV file parameters
174
- channels = 1
175
- sample_width = 2 # 16-bit
176
- frame_rate = sample_rate
177
- audio_length = len(audio_data)
178
-
179
- # Create proper WAV header
180
- wav_header = struct.pack('<4sI4s4sIHHIIHH4sI',
181
- b'RIFF', # ChunkID
182
- 36 + audio_length, # ChunkSize
183
- b'WAVE', # Format
184
- b'fmt ', # Subchunk1ID
185
- 16, # Subchunk1Size (PCM)
186
- 1, # AudioFormat (PCM = 1)
187
- channels, # NumChannels
188
- frame_rate, # SampleRate
189
- frame_rate * channels * sample_width, # ByteRate
190
- channels * sample_width, # BlockAlign
191
- sample_width * 8, # BitsPerSample
192
- b'data', # Subchunk2ID
193
- audio_length # Subchunk2Size
194
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
- # Combine header and audio data
197
- wav_data = wav_header + audio_data
198
 
199
- log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_data)} WAV")
200
- log_info(f"🔧 WAV specs: {channels}ch, {frame_rate}Hz, {sample_width*8}bit")
201
 
202
  return wav_data
203
 
204
  except Exception as e:
205
- log_error(f"WAV conversion failed: {e}")
206
  # Fallback to raw PCM
207
  return audio_data
208
 
 
5
  from datetime import datetime
6
  import io
7
  import wave
8
+ import struct
9
  from google.cloud import speech
10
  from google.cloud.speech import RecognitionConfig, RecognitionAudio
11
  from utils.logger import log_info, log_error, log_debug, log_warning
 
66
 
67
  # Default to the language itself if not in map
68
  return language_map.get(language, language)
69
+
70
def _analyze_audio_content(self, audio_data: bytes, sample_rate: int = 16000):
    """Log diagnostic statistics about a raw PCM audio buffer.

    The buffer is interpreted as little-endian signed 16-bit samples. The
    method only writes log lines; it returns nothing and never raises,
    so a failure here cannot interrupt the caller.

    Args:
        audio_data: Raw sample bytes. Buffers shorter than 100 bytes are
            reported as too short and not analysed further.
        sample_rate: Samples per second, used only to convert the first
            loud sample's index into seconds. Defaults to 16000, the
            value that was previously hard-coded.
    """
    try:
        if len(audio_data) < 100:
            log_warning(f"⚠️ Very short audio data: {len(audio_data)} bytes")
            return

        # A trailing odd byte cannot form a 16-bit sample. Drop it instead
        # of letting struct.unpack raise and abort the whole analysis.
        total_samples = len(audio_data) // 2
        # '<' pins little-endian so the result does not depend on the host.
        samples = struct.unpack(f'<{total_samples}h', audio_data[:total_samples * 2])

        # Basic stats
        zero_count = samples.count(0)
        zero_percentage = (zero_count / total_samples) * 100
        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_percentage:.1f}%)")

        if zero_count == total_samples:
            log_warning(f"⚠️ All samples are zero - no audio content")
            return

        # Amplitude statistics deliberately ignore exact-zero samples.
        magnitudes = [abs(s) for s in samples if s != 0]
        avg_amplitude = sum(magnitudes) / len(magnitudes)
        max_amplitude = max(magnitudes)
        log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

        # Section analysis: ten consecutive slices of the buffer.
        section_count = 10
        section_size = total_samples // section_count
        log_info(f"🔍 Section analysis (each {section_size} samples):")

        for i in range(section_count):
            start = i * section_size
            # The last section absorbs the remainder so no tail samples
            # are left out when the total is not a multiple of ten.
            end = total_samples if i == section_count - 1 else start + section_size
            section_len = end - start

            section_magnitudes = [abs(s) for s in samples[start:end] if s != 0]
            if not section_magnitudes:
                # Fully silent sections are not logged.
                continue

            section_zero_pct = ((section_len - len(section_magnitudes)) / section_len) * 100
            section_max = max(section_magnitudes)
            section_avg = sum(section_magnitudes) / len(section_magnitudes)
            log_info(f"Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={section_zero_pct:.1f}%")

        # Find where speech starts (first sample louder than the threshold).
        speech_threshold = 1000  # Minimum amplitude to consider as speech
        speech_start = next(
            (index for index, sample in enumerate(samples) if abs(sample) > speech_threshold),
            None,
        )

        if speech_start is not None:
            log_info(f"🎤 Speech detected starting at sample {speech_start} ({speech_start/sample_rate:.2f}s)")
        else:
            log_warning(f"⚠️ No clear speech signal detected (threshold: {speech_threshold})")

    except Exception as e:
        # Best-effort diagnostics: log the problem and carry on.
        log_error(f"❌ Error analyzing audio: {e}")
128
+
129
  async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
130
  """Transcribe audio data using Google Cloud Speech API"""
131
  try:
 
136
 
137
  log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
138
 
139
+ # ✅ Audio analizi
140
+ self._analyze_audio_content(audio_data)
141
+
142
+ # WAV formatında gönder - Google bu formatı daha iyi tanıyor
143
+ wav_audio = self._convert_to_wav_proper(audio_data, config.sample_rate)
 
 
 
 
 
 
 
 
 
 
 
 
144
  log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
145
 
146
  # Configure recognition
147
  language_code = self._map_language_code(config.language)
148
 
149
+ # ✅ WAV formatı için doğru config
150
  recognition_config = RecognitionConfig(
151
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
152
+ sample_rate_hertz=config.sample_rate,
153
+ language_code=language_code,
154
+ audio_channel_count=1,
155
  enable_separate_recognition_per_channel=False,
156
+ # ✅ Enhanced model kullan
157
+ model="latest_long",
158
+ use_enhanced=True,
159
+ # ✅ Punctuation ekle
160
+ enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
161
+ # ✅ Profanity filter'ı kapat (daha iyi tanıma için)
162
+ profanity_filter=False,
163
+ # ✅ Audio analizi için metadata
164
+ metadata=speech.RecognitionMetadata(
165
+ interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
166
+ microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
167
+ original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
168
+ recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
169
+ )
170
  )
171
 
172
+ log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=latest_long")
173
 
174
+ # ✅ Create audio object with WAV data
175
+ audio = RecognitionAudio(content=wav_audio)
176
 
177
  # Perform synchronous recognition
178
  log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
179
  response = self.client.recognize(config=recognition_config, audio=audio)
180
 
181
+ # ✅ Detaylı response analizi
182
  log_debug(f"API Response: {response}")
183
  log_info(f"🔍 Google response details:")
184
  log_info(f"- Has results: {bool(response.results)}")
185
  log_info(f"- Results count: {len(response.results)}")
186
 
187
+ # ✅ Request ID'yi logla
188
+ if hasattr(response, '_pb') and hasattr(response._pb, 'request_id'):
189
+ log_info(f"- Request ID: {response._pb.request_id}")
190
+
191
  if hasattr(response, 'total_billed_time'):
192
+ billed_seconds = response.total_billed_time.total_seconds()
193
+ log_info(f"- Billed time: {billed_seconds}s")
194
+
195
+ # ✅ Eğer billed time 0 ise, Google hiç audio işlememiş demektir
196
+ if billed_seconds == 0:
197
+ log_error("❌ Google didn't process any audio - possible format issue")
198
+ return None
199
  else:
200
  log_info(f"- Billed time: 0s (no audio processed)")
201
 
202
  # Process results
203
  if response.results:
204
+ for i, result in enumerate(response.results):
205
+ log_debug(f"Result {i}: {result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
+ if result.alternatives:
208
+ alternative = result.alternatives[0]
209
+
210
+ # Extract word timestamps if available
211
+ word_timestamps = None
212
+ if config.enable_word_timestamps and hasattr(alternative, 'words'):
213
+ word_timestamps = [
214
+ {
215
+ "word": word_info.word,
216
+ "start_time": word_info.start_time.total_seconds(),
217
+ "end_time": word_info.end_time.total_seconds()
218
+ }
219
+ for word_info in alternative.words
220
+ ]
221
+
222
+ transcription = TranscriptionResult(
223
+ text=alternative.transcript,
224
+ confidence=alternative.confidence,
225
+ timestamp=datetime.now().timestamp(),
226
+ language=language_code,
227
+ word_timestamps=word_timestamps
228
+ )
229
+
230
+ log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
231
+ return transcription
232
 
233
  log_warning("⚠️ No transcription results - Google couldn't recognize speech")
234
  return None
 
239
  log_error(f"Traceback: {traceback.format_exc()}")
240
  return None
241
 
242
def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Prefix raw PCM bytes with a 44-byte RIFF/WAVE header.

    The header describes the payload as uncompressed PCM, mono, 16 bits
    per sample, at ``sample_rate`` Hz. The audio bytes themselves are
    appended unchanged.

    Args:
        audio_data: Raw sample bytes to wrap.
        sample_rate: Sample rate in Hz written into the header.

    Returns:
        The header followed by ``audio_data``. If building the header
        fails, the error is logged and ``audio_data`` is returned as-is.
    """
    try:
        channels = 1          # mono
        sample_width = 2      # bytes per sample (16-bit)
        length = len(audio_data)

        # All header fields are little-endian and packed in a single call.
        wav_header = struct.pack(
            '<4sI4s4sIHHIIHH4sI',
            b'RIFF',                               # ChunkID
            36 + length,                           # ChunkSize (file size - 8)
            b'WAVE',                               # Format
            b'fmt ',                               # Subchunk1ID
            16,                                    # Subchunk1Size (PCM)
            1,                                     # AudioFormat (PCM = 1)
            channels,                              # NumChannels
            sample_rate,                           # SampleRate
            sample_rate * channels * sample_width, # ByteRate
            channels * sample_width,               # BlockAlign
            sample_width * 8,                      # BitsPerSample
            b'data',                               # Subchunk2ID
            length,                                # Subchunk2Size
        )

        wav_data = b''.join((wav_header, audio_data))

        log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")

        return wav_data

    except Exception as e:
        log_error(f"WAV conversion failed: {e}")
        # Fallback to raw PCM
        return audio_data
295