Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +45 -15
stt/stt_google.py
CHANGED
@@ -140,7 +140,9 @@ class GoogleSTT(STTInterface):
|
|
140 |
log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
|
141 |
return None
|
142 |
|
143 |
-
|
|
|
|
|
144 |
recognition_config = RecognitionConfig(
|
145 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
146 |
sample_rate_hertz=16000,
|
@@ -150,6 +152,9 @@ class GoogleSTT(STTInterface):
|
|
150 |
enable_automatic_punctuation=True,
|
151 |
)
|
152 |
|
|
|
|
|
|
|
153 |
# ✅ RAW audio gönder, WAV conversion yapmadan
|
154 |
audio = RecognitionAudio(content=audio_data) # Direkt raw PCM
|
155 |
|
@@ -201,20 +206,45 @@ class GoogleSTT(STTInterface):
|
|
201 |
return None
|
202 |
|
203 |
def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
|
204 |
-
"""Convert raw PCM
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
def get_supported_languages(self) -> List[str]:
|
220 |
"""Get list of supported language codes"""
|
|
|
140 |
log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
|
141 |
return None
|
142 |
|
143 |
+
wav_audio = self._convert_to_wav(audio_data, 16000)
|
144 |
+
|
145 |
+
# Configure recognition
|
146 |
recognition_config = RecognitionConfig(
|
147 |
encoding=RecognitionConfig.AudioEncoding.LINEAR16,
|
148 |
sample_rate_hertz=16000,
|
|
|
152 |
enable_automatic_punctuation=True,
|
153 |
)
|
154 |
|
155 |
+
# ✅ WAV audio gönder
|
156 |
+
audio = RecognitionAudio(content=wav_audio)
|
157 |
+
|
158 |
# ✅ RAW audio gönder, WAV conversion yapmadan
|
159 |
audio = RecognitionAudio(content=audio_data) # Direkt raw PCM
|
160 |
|
|
|
206 |
return None
|
207 |
|
208 |
def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Prefix raw PCM bytes with a 44-byte RIFF/WAVE header.

    The header describes mono, 16-bit PCM sampled at ``sample_rate`` Hz.

    Args:
        audio_data: Raw PCM payload to wrap.
        sample_rate: Sample rate in Hz written into the header.

    Returns:
        The header followed by ``audio_data``. If anything goes wrong while
        building the header, the error is logged and ``audio_data`` is
        returned unchanged.
    """
    try:
        import struct

        # Fixed stream layout: one channel, two bytes per sample.
        num_channels = 1
        bytes_per_sample = 2
        bits_per_sample = bytes_per_sample * 8
        hz = sample_rate
        payload_size = len(audio_data)

        # Fields in the canonical RIFF order; little-endian, no padding.
        header_fields = (
            b'RIFF',                               # ChunkID
            36 + payload_size,                     # ChunkSize
            b'WAVE',                               # Format
            b'fmt ',                               # Subchunk1ID
            16,                                    # Subchunk1Size (PCM)
            1,                                     # AudioFormat (PCM = 1)
            num_channels,                          # NumChannels
            hz,                                    # SampleRate
            hz * num_channels * bytes_per_sample,  # ByteRate
            num_channels * bytes_per_sample,       # BlockAlign
            bits_per_sample,                       # BitsPerSample
            b'data',                               # Subchunk2ID
            payload_size,                          # Subchunk2Size
        )
        header = struct.pack('<4sI4s4sIHHIIHH4sI', *header_fields)

        wrapped = header + audio_data

        log_info(f"🔧 WAV conversion: {payload_size} PCM → {len(wrapped)} WAV")
        log_info(f"🔧 WAV specs: {num_channels}ch, {hz}Hz, {bits_per_sample}bit")

        return wrapped

    except Exception as e:
        log_error(f"WAV conversion failed: {e}")
        # Best effort: hand back the untouched PCM so the caller can proceed.
        return audio_data
|
248 |
|
249 |
def get_supported_languages(self) -> List[str]:
|
250 |
"""Get list of supported language codes"""
|