Spaces:
Running
Running
Update stt/stt_google.py
Browse files- stt/stt_google.py +51 -51
stt/stt_google.py
CHANGED
@@ -127,56 +127,56 @@ class GoogleSTT(STTInterface):
|
|
127 |
log_error(f"❌ Error analyzing audio: {e}")
|
128 |
|
129 |
def _trim_silence(self, audio_data: bytes) -> bytes:
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
return audio_data
|
159 |
-
|
160 |
-
# Add small padding (250ms = 4000 samples at 16kHz)
|
161 |
-
padding = 2000 # 125ms padding
|
162 |
-
start_idx = max(0, start_idx - padding)
|
163 |
-
end_idx = min(len(samples) - 1, end_idx + padding)
|
164 |
-
|
165 |
-
# Extract trimmed audio
|
166 |
-
trimmed_samples = samples[start_idx:end_idx + 1]
|
167 |
-
|
168 |
-
log_info(f"🔧 Silence trimming: {len(samples)} → {len(trimmed_samples)} samples")
|
169 |
-
log_info(f"🔧 Trimmed duration: {len(trimmed_samples)/16000:.2f}s")
|
170 |
-
|
171 |
-
# Convert back to bytes
|
172 |
-
trimmed_audio = struct.pack(f'{len(trimmed_samples)}h', *trimmed_samples)
|
173 |
-
|
174 |
-
return trimmed_audio
|
175 |
-
|
176 |
-
except Exception as e:
|
177 |
-
log_error(f"❌ Silence trimming failed: {e}")
|
178 |
return audio_data
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
181 |
"""Transcribe audio data using Google Cloud Speech API"""
|
182 |
try:
|
@@ -194,8 +194,8 @@ class GoogleSTT(STTInterface):
|
|
194 |
trimmed_audio = self._trim_silence(audio_data)
|
195 |
|
196 |
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
197 |
-
wav_audio = self._convert_to_wav_proper(
|
198 |
-
log_info(f"🔧 WAV conversion: {len(
|
199 |
|
200 |
import tempfile
|
201 |
import os
|
@@ -203,7 +203,7 @@ class GoogleSTT(STTInterface):
|
|
203 |
# Raw PCM kaydet
|
204 |
pcm_file = tempfile.mktemp(suffix='.pcm')
|
205 |
with open(pcm_file, 'wb') as f:
|
206 |
-
f.write(
|
207 |
log_info(f"🔍 Raw PCM saved to: {pcm_file}")
|
208 |
|
209 |
# WAV kaydet
|
|
|
127 |
log_error(f"❌ Error analyzing audio: {e}")
|
128 |
|
129 |
def _trim_silence(self, audio_data: bytes) -> bytes:
|
130 |
+
"""Trim silence from beginning and end of audio"""
|
131 |
+
try:
|
132 |
+
if len(audio_data) < 100:
|
133 |
+
return audio_data
|
134 |
+
|
135 |
+
# Convert to samples
|
136 |
+
samples = list(struct.unpack(f'{len(audio_data)//2}h', audio_data))
|
137 |
+
|
138 |
+
# Silence threshold - daha düşük bir threshold kullan
|
139 |
+
silence_threshold = 200 # Daha düşük threshold
|
140 |
+
|
141 |
+
# Find first non-silent sample
|
142 |
+
start_idx = 0
|
143 |
+
for i, sample in enumerate(samples):
|
144 |
+
if abs(sample) > silence_threshold:
|
145 |
+
start_idx = i
|
146 |
+
break
|
147 |
+
|
148 |
+
# Find last non-silent sample
|
149 |
+
end_idx = len(samples) - 1
|
150 |
+
for i in range(len(samples) - 1, -1, -1):
|
151 |
+
if abs(samples[i]) > silence_threshold:
|
152 |
+
end_idx = i
|
153 |
+
break
|
154 |
+
|
155 |
+
# Ensure we have some audio
|
156 |
+
if start_idx >= end_idx:
|
157 |
+
log_warning("⚠️ No audio content above silence threshold")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
return audio_data
|
159 |
|
160 |
+
# Add small padding (250ms = 4000 samples at 16kHz)
|
161 |
+
padding = 2000 # 125ms padding
|
162 |
+
start_idx = max(0, start_idx - padding)
|
163 |
+
end_idx = min(len(samples) - 1, end_idx + padding)
|
164 |
+
|
165 |
+
# Extract trimmed audio
|
166 |
+
trimmed_samples = samples[start_idx:end_idx + 1]
|
167 |
+
|
168 |
+
log_info(f"🔧 Silence trimming: {len(samples)} → {len(trimmed_samples)} samples")
|
169 |
+
log_info(f"🔧 Trimmed duration: {len(trimmed_samples)/16000:.2f}s")
|
170 |
+
|
171 |
+
# Convert back to bytes
|
172 |
+
trimmed_audio = struct.pack(f'{len(trimmed_samples)}h', *trimmed_samples)
|
173 |
+
|
174 |
+
return trimmed_audio
|
175 |
+
|
176 |
+
except Exception as e:
|
177 |
+
log_error(f"❌ Silence trimming failed: {e}")
|
178 |
+
return audio_data
|
179 |
+
|
180 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
181 |
"""Transcribe audio data using Google Cloud Speech API"""
|
182 |
try:
|
|
|
194 |
trimmed_audio = self._trim_silence(audio_data)
|
195 |
|
196 |
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
197 |
+
wav_audio = self._convert_to_wav_proper(trimmed_audio, config.sample_rate)
|
198 |
+
log_info(f"🔧 WAV conversion: {len(trimmed_audio)} PCM → {len(wav_audio)} WAV")
|
199 |
|
200 |
import tempfile
|
201 |
import os
|
|
|
203 |
# Raw PCM kaydet
|
204 |
pcm_file = tempfile.mktemp(suffix='.pcm')
|
205 |
with open(pcm_file, 'wb') as f:
|
206 |
+
f.write(trimmed_audio)
|
207 |
log_info(f"🔍 Raw PCM saved to: {pcm_file}")
|
208 |
|
209 |
# WAV kaydet
|