Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +170 -83
stt/stt_google.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, List
|
|
5 |
from datetime import datetime
|
6 |
import io
|
7 |
import wave
|
|
|
8 |
from google.cloud import speech
|
9 |
from google.cloud.speech import RecognitionConfig, RecognitionAudio
|
10 |
from utils.logger import log_info, log_error, log_debug, log_warning
|
@@ -65,7 +66,66 @@ class GoogleSTT(STTInterface):
|
|
65 |
|
66 |
# Default to the language itself if not in map
|
67 |
return language_map.get(language, language)
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
70 |
"""Transcribe audio data using Google Cloud Speech API"""
|
71 |
try:
|
@@ -76,85 +136,99 @@ class GoogleSTT(STTInterface):
|
|
76 |
|
77 |
log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
|
78 |
|
79 |
-
# ✅
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
log_debug(f"Audio first 50 bytes: {first_50.hex()}")
|
85 |
-
log_debug(f"Audio last 50 bytes: {last_50.hex()}")
|
86 |
-
|
87 |
-
# Ortalama amplitude kontrolü
|
88 |
-
import struct
|
89 |
-
samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
|
90 |
-
avg_amplitude = sum(abs(s) for s in samples) / len(samples)
|
91 |
-
max_amplitude = max(abs(s) for s in samples)
|
92 |
-
log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
|
93 |
-
|
94 |
-
# ✅ Convert to WAV format for better compatibility
|
95 |
-
wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
|
96 |
log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
|
97 |
|
98 |
# Configure recognition
|
99 |
language_code = self._map_language_code(config.language)
|
100 |
|
101 |
-
# ✅ WAV
|
102 |
recognition_config = RecognitionConfig(
|
103 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
104 |
-
sample_rate_hertz=
|
105 |
-
language_code=
|
106 |
-
audio_channel_count=1,
|
107 |
enable_separate_recognition_per_channel=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
)
|
109 |
|
110 |
-
log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=
|
111 |
|
112 |
-
# ✅ Create audio object with WAV data
|
113 |
-
audio = RecognitionAudio(content=wav_audio)
|
114 |
|
115 |
# Perform synchronous recognition
|
116 |
log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
|
117 |
response = self.client.recognize(config=recognition_config, audio=audio)
|
118 |
|
119 |
-
# ✅
|
120 |
log_debug(f"API Response: {response}")
|
121 |
log_info(f"🔍 Google response details:")
|
122 |
log_info(f"- Has results: {bool(response.results)}")
|
123 |
log_info(f"- Results count: {len(response.results)}")
|
124 |
|
|
|
|
|
|
|
|
|
125 |
if hasattr(response, 'total_billed_time'):
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
else:
|
128 |
log_info(f"- Billed time: 0s (no audio processed)")
|
129 |
|
130 |
# Process results
|
131 |
if response.results:
|
132 |
-
result
|
133 |
-
|
134 |
-
alternative = result.alternatives[0]
|
135 |
-
|
136 |
-
# Extract word timestamps if available
|
137 |
-
word_timestamps = None
|
138 |
-
if config.enable_word_timestamps and hasattr(alternative, 'words'):
|
139 |
-
word_timestamps = [
|
140 |
-
{
|
141 |
-
"word": word_info.word,
|
142 |
-
"start_time": word_info.start_time.total_seconds(),
|
143 |
-
"end_time": word_info.end_time.total_seconds()
|
144 |
-
}
|
145 |
-
for word_info in alternative.words
|
146 |
-
]
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
log_warning("⚠️ No transcription results - Google couldn't recognize speech")
|
160 |
return None
|
@@ -165,44 +239,57 @@ class GoogleSTT(STTInterface):
|
|
165 |
log_error(f"Traceback: {traceback.format_exc()}")
|
166 |
return None
|
167 |
|
168 |
-
def
|
169 |
-
"""Convert raw PCM to proper WAV format"""
|
170 |
try:
|
171 |
-
|
172 |
-
|
173 |
-
# WAV
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
log_info(f"🔧 WAV
|
200 |
-
log_info(f"🔧 WAV specs: {channels}ch, {frame_rate}Hz, {sample_width*8}bit")
|
201 |
|
202 |
return wav_data
|
203 |
|
204 |
except Exception as e:
|
205 |
-
log_error(f"WAV conversion failed: {e}")
|
206 |
# Fallback to raw PCM
|
207 |
return audio_data
|
208 |
|
|
|
5 |
from datetime import datetime
|
6 |
import io
|
7 |
import wave
|
8 |
+
import struct
|
9 |
from google.cloud import speech
|
10 |
from google.cloud.speech import RecognitionConfig, RecognitionAudio
|
11 |
from utils.logger import log_info, log_error, log_debug, log_warning
|
|
|
66 |
|
67 |
# Default to the language itself if not in map
|
68 |
return language_map.get(language, language)
|
69 |
+
|
70 |
+
def _analyze_audio_content(self, audio_data: bytes, sample_rate: int = 16000):
    """Analyze raw 16-bit mono PCM audio for debugging.

    Logs overall zero/non-zero amplitude statistics, a 10-section amplitude
    breakdown, and the position of the first sample loud enough to count as
    speech. Purely diagnostic: emits log output only, returns None.

    Args:
        audio_data: Raw little-endian 16-bit PCM bytes.
        sample_rate: Sample rate in Hz, used only to convert the speech-start
            sample index into seconds. Defaults to 16000, the value the old
            code hard-coded into that calculation.
    """
    try:
        if len(audio_data) < 100:
            log_warning(f"⚠️ Very short audio data: {len(audio_data)} bytes")
            return

        # unpack_from tolerates a stray trailing byte; plain unpack raises
        # on any buffer that is not an exact multiple of 2 bytes.
        samples = struct.unpack_from(f'{len(audio_data)//2}h', audio_data)
        total_samples = len(samples)

        # Overall statistics.
        non_zero_samples = [s for s in samples if s != 0]
        zero_count = total_samples - len(non_zero_samples)
        zero_percentage = (zero_count / total_samples) * 100

        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_percentage:.1f}%)")

        if non_zero_samples:
            avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
            max_amplitude = max(abs(s) for s in non_zero_samples)
            log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

            # Per-section breakdown: split the signal into 10 chunks.
            # The last chunk absorbs the remainder so tail samples are not
            # silently dropped (the old `min((i+1)*size, total)` bound
            # skipped up to 9 trailing samples).
            section_size = total_samples // 10
            log_info(f"🔍 Section analysis (each {section_size} samples):")

            for i in range(10):
                start = i * section_size
                end = total_samples if i == 9 else (i + 1) * section_size
                section = samples[start:end]
                if not section:
                    continue  # guards the zero-division below for tiny inputs

                section_non_zero = [s for s in section if s != 0]
                section_zeros = len(section) - len(section_non_zero)
                section_zero_pct = (section_zeros / len(section)) * 100

                if section_non_zero:
                    section_max = max(abs(s) for s in section_non_zero)
                    section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero)
                    log_info(f"Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={section_zero_pct:.1f}%")

            # Find where speech starts (first significant activity).
            speech_threshold = 1000  # minimum amplitude to consider as speech
            speech_start = next(
                (i for i, sample in enumerate(samples) if abs(sample) > speech_threshold),
                None,
            )

            if speech_start is not None:
                log_info(f"🎤 Speech detected starting at sample {speech_start} ({speech_start/sample_rate:.2f}s)")
            else:
                log_warning(f"⚠️ No clear speech signal detected (threshold: {speech_threshold})")
        else:
            log_warning(f"⚠️ All samples are zero - no audio content")

    except Exception as e:
        log_error(f"❌ Error analyzing audio: {e}")
|
128 |
+
|
129 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
130 |
"""Transcribe audio data using Google Cloud Speech API"""
|
131 |
try:
|
|
|
136 |
|
137 |
log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
|
138 |
|
139 |
+
# ✅ Audio analizi
|
140 |
+
self._analyze_audio_content(audio_data)
|
141 |
+
|
142 |
+
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
143 |
+
wav_audio = self._convert_to_wav_proper(audio_data, config.sample_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
|
145 |
|
146 |
# Configure recognition
|
147 |
language_code = self._map_language_code(config.language)
|
148 |
|
149 |
+
# ✅ WAV formatı için doğru config
|
150 |
recognition_config = RecognitionConfig(
|
151 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
152 |
+
sample_rate_hertz=config.sample_rate,
|
153 |
+
language_code=language_code,
|
154 |
+
audio_channel_count=1,
|
155 |
enable_separate_recognition_per_channel=False,
|
156 |
+
# ✅ Enhanced model kullan
|
157 |
+
model="latest_long",
|
158 |
+
use_enhanced=True,
|
159 |
+
# ✅ Punctuation ekle
|
160 |
+
enable_automatic_punctuation=config.enable_punctuation if hasattr(config, 'enable_punctuation') else True,
|
161 |
+
# ✅ Profanity filter'ı kapat (daha iyi tanıma için)
|
162 |
+
profanity_filter=False,
|
163 |
+
# ✅ Audio analizi için metadata
|
164 |
+
metadata=speech.RecognitionMetadata(
|
165 |
+
interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
|
166 |
+
microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
|
167 |
+
original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
|
168 |
+
recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC
|
169 |
+
)
|
170 |
)
|
171 |
|
172 |
+
log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model=latest_long")
|
173 |
|
174 |
+
# ✅ Create audio object with WAV data
|
175 |
+
audio = RecognitionAudio(content=wav_audio)
|
176 |
|
177 |
# Perform synchronous recognition
|
178 |
log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
|
179 |
response = self.client.recognize(config=recognition_config, audio=audio)
|
180 |
|
181 |
+
# ✅ Detaylı response analizi
|
182 |
log_debug(f"API Response: {response}")
|
183 |
log_info(f"🔍 Google response details:")
|
184 |
log_info(f"- Has results: {bool(response.results)}")
|
185 |
log_info(f"- Results count: {len(response.results)}")
|
186 |
|
187 |
+
# ✅ Request ID'yi logla
|
188 |
+
if hasattr(response, '_pb') and hasattr(response._pb, 'request_id'):
|
189 |
+
log_info(f"- Request ID: {response._pb.request_id}")
|
190 |
+
|
191 |
if hasattr(response, 'total_billed_time'):
|
192 |
+
billed_seconds = response.total_billed_time.total_seconds()
|
193 |
+
log_info(f"- Billed time: {billed_seconds}s")
|
194 |
+
|
195 |
+
# ✅ Eğer billed time 0 ise, Google hiç audio işlememiş demektir
|
196 |
+
if billed_seconds == 0:
|
197 |
+
log_error("❌ Google didn't process any audio - possible format issue")
|
198 |
+
return None
|
199 |
else:
|
200 |
log_info(f"- Billed time: 0s (no audio processed)")
|
201 |
|
202 |
# Process results
|
203 |
if response.results:
|
204 |
+
for i, result in enumerate(response.results):
|
205 |
+
log_debug(f"Result {i}: {result}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
+
if result.alternatives:
|
208 |
+
alternative = result.alternatives[0]
|
209 |
+
|
210 |
+
# Extract word timestamps if available
|
211 |
+
word_timestamps = None
|
212 |
+
if config.enable_word_timestamps and hasattr(alternative, 'words'):
|
213 |
+
word_timestamps = [
|
214 |
+
{
|
215 |
+
"word": word_info.word,
|
216 |
+
"start_time": word_info.start_time.total_seconds(),
|
217 |
+
"end_time": word_info.end_time.total_seconds()
|
218 |
+
}
|
219 |
+
for word_info in alternative.words
|
220 |
+
]
|
221 |
+
|
222 |
+
transcription = TranscriptionResult(
|
223 |
+
text=alternative.transcript,
|
224 |
+
confidence=alternative.confidence,
|
225 |
+
timestamp=datetime.now().timestamp(),
|
226 |
+
language=language_code,
|
227 |
+
word_timestamps=word_timestamps
|
228 |
+
)
|
229 |
+
|
230 |
+
log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
|
231 |
+
return transcription
|
232 |
|
233 |
log_warning("⚠️ No transcription results - Google couldn't recognize speech")
|
234 |
return None
|
|
|
239 |
log_error(f"Traceback: {traceback.format_exc()}")
|
240 |
return None
|
241 |
|
242 |
+
def _convert_to_wav_proper(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Convert raw 16-bit mono PCM to a standard WAV byte string.

    Uses the stdlib ``wave`` module (already imported at the top of this
    file) instead of hand-writing the 44-byte RIFF header; for mono /
    16-bit / PCM the emitted bytes are identical to the previous manual
    struct-based implementation. Also drops the old unused ``buffer_size``
    local.

    Args:
        audio_data: Raw little-endian 16-bit mono PCM bytes.
        sample_rate: Sample rate in Hz to record in the WAV header.

    Returns:
        WAV-encoded bytes, or ``audio_data`` unchanged if conversion fails
        (deliberate best-effort fallback, preserved from the original).
    """
    try:
        # Build the WAV in memory; wave.Wave_write patches the RIFF and
        # data chunk sizes on close, producing the standard 44-byte header.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)        # mono
            wav_file.setsampwidth(2)        # 16-bit samples
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)

        wav_data = wav_buffer.getvalue()
        wav_buffer.close()

        log_info(f"🔧 WAV specs: 1ch, {sample_rate}Hz, 16bit")

        return wav_data

    except Exception as e:
        log_error(f"❌ WAV conversion failed: {e}")
        # Fallback to raw PCM
        return audio_data
|
295 |
|