Update stt/stt_google.py
stt/stt_google.py  +91 -57
stt/stt_google.py
CHANGED
@@ -67,98 +67,132 @@ class GoogleSTT(STTInterface):
         return language_map.get(language, language)
 
     async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
-        """Transcribe audio data using Google Cloud Speech API"""
         try:
-            # Check if we have audio to transcribe
             if not audio_data:
                 log_warning("⚠️ No audio data provided")
                 return None
 
             log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
-            # ✅
-
-
-
-
-
-
+            # ✅ Detailed audio analysis - logged
+            import struct
+            samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
+            total_samples = len(samples)
+
+            # 1. Overall statistics
+            non_zero_samples = [s for s in samples if s != 0]
+            zero_count = total_samples - len(non_zero_samples)
+
+            if non_zero_samples:
+                avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
+                max_amplitude = max(abs(s) for s in non_zero_samples)
+            else:
+                avg_amplitude = 0
+                max_amplitude = 0
+
+            log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
+            log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")
+
+            # 2. Section-by-section analysis (split into 10 sections)
+            section_size = total_samples // 10
+            log_info(f"🔍 Section analysis (each {section_size} samples):")
+
+            for i in range(10):
+                start_idx = i * section_size
+                end_idx = (i + 1) * section_size if i < 9 else total_samples
+                section = samples[start_idx:end_idx]
+
+                section_non_zero = [s for s in section if s != 0]
+                section_max = max(abs(s) for s in section_non_zero) if section_non_zero else 0
+                section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero) if section_non_zero else 0
+                zero_ratio = (len(section) - len(section_non_zero)) / len(section)
+
+                log_info(f"  Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={zero_ratio:.1%}")
+
+            # 3. Find where speech first starts
+            speech_threshold = 500  # RMS threshold
+            speech_start_idx = -1
+
+            # Compute RMS over 100-sample windows
+            window_size = 100
+            for i in range(0, total_samples - window_size, window_size):
+                window = samples[i:i + window_size]
+                rms = (sum(s * s for s in window) / window_size) ** 0.5
 
-
-
-
-
-
-
-
-
+                if rms > speech_threshold:
+                    speech_start_idx = i
+                    break
+
+            if speech_start_idx >= 0:
+                speech_start_time = speech_start_idx / config.sample_rate
+                log_info(f"🎤 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
+            else:
+                log_warning("⚠️ No speech detected above threshold in entire audio")
+
+            # 4. Check whether the audio is actually empty
+            if max_amplitude < 100:
+                log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
+                return None
+
+            if zero_count / total_samples > 0.95:  # more than 95% zeros
+                log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
+                return None
+
+            # Convert to WAV format
             wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
 
             # Configure recognition
-            language_code = self._map_language_code(config.language)
-
-            """
-            recognition_config = RecognitionConfig(
-                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=config.sample_rate,
-                language_code=language_code,
-                enable_automatic_punctuation=config.enable_punctuation,
-                model=config.model,
-                use_enhanced=config.use_enhanced,
-                enable_word_time_offsets=config.enable_word_timestamps,
-            )
-            """
-
             recognition_config = RecognitionConfig(
                 encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                 sample_rate_hertz=16000,
                 language_code="tr-TR",
-                audio_channel_count=1,
+                audio_channel_count=1,
                 enable_separate_recognition_per_channel=False,
+                enable_automatic_punctuation=True,
             )
-
-            log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
-
+
             # Create audio object
             audio = RecognitionAudio(content=wav_audio)
 
             # Perform synchronous recognition
             log_info(f"🔄 Sending audio to Google Cloud Speech API...")
             response = self.client.recognize(config=recognition_config, audio=audio)
-
-            # ✅
-
-
+
+            # ✅ Detailed response analysis
+            log_info(f"🔍 Google response details:")
+            log_info(f"  - Has results: {bool(response.results)}")
+            log_info(f"  - Results count: {len(response.results) if response.results else 0}")
+
+            if hasattr(response, 'total_billed_time'):
+                if response.total_billed_time and response.total_billed_time.total_seconds() > 0:
+                    log_info(f"  - Billed time: {response.total_billed_time.total_seconds()}s")
+                else:
+                    log_info(f"  - Billed time: 0s (no audio processed)")
+
             # Process results
-            if response.results:
+            if response.results and len(response.results) > 0:
+                for i, result in enumerate(response.results):
+                    log_info(f"  - Result {i}: {len(result.alternatives)} alternatives")
+                    if result.alternatives:
+                        for j, alt in enumerate(result.alternatives):
+                            log_info(f"    - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")
+
                 result = response.results[0]
-                if result.alternatives:
+                if result.alternatives and len(result.alternatives) > 0:
                     alternative = result.alternatives[0]
 
-                    # Extract word timestamps if available
-                    word_timestamps = None
-                    if config.enable_word_timestamps and hasattr(alternative, 'words'):
-                        word_timestamps = [
-                            {
-                                "word": word_info.word,
-                                "start_time": word_info.start_time.total_seconds(),
-                                "end_time": word_info.end_time.total_seconds()
-                            }
-                            for word_info in alternative.words
-                        ]
-
                     transcription = TranscriptionResult(
                         text=alternative.transcript,
                         confidence=alternative.confidence,
                         timestamp=datetime.now().timestamp(),
-                        language=language_code,
-                        word_timestamps=word_timestamps
+                        language="tr-TR",
+                        word_timestamps=None
                     )
 
-                    log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
+                    log_info(f"✅ Transcription SUCCESS: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
                     return transcription
 
-            log_warning("⚠️ No transcription results")
+            log_warning("⚠️ No transcription results - Google couldn't recognize speech")
            return None
 
         except Exception as e:
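
The speech-start heuristic added above is easy to sanity-check in isolation. Below is a minimal sketch, not part of the module, that lifts the same windowed-RMS logic into a standalone function and runs it on synthetic audio; find_speech_start, the 440 Hz tone, and the amplitude of 8000 are illustrative choices, not values from the commit.

import math

def find_speech_start(samples, threshold=500, window_size=100):
    """Return the start index of the first window whose RMS exceeds threshold, else -1."""
    for i in range(0, len(samples) - window_size, window_size):
        window = samples[i:i + window_size]
        rms = math.sqrt(sum(s * s for s in window) / window_size)
        if rms > threshold:
            return i
    return -1

# Synthetic check: 0.5 s of silence, then 0.5 s of a 440 Hz tone at 16 kHz.
sample_rate = 16000
silence = [0] * (sample_rate // 2)
tone = [int(8000 * math.sin(2 * math.pi * 440 * t / sample_rate))
        for t in range(sample_rate // 2)]

start = find_speech_start(silence + tone)
print(f"speech starts at sample {start} ({start / sample_rate:.2f}s)")  # 8000 (0.50s)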
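
One caveat with the unpacking step: struct.unpack(f'{len(audio_data)//2}h', audio_data) raises struct.error when the byte count is odd, and the bare 'h' format uses the platform's native byte order. A defensive variant might look like the sketch below; pcm16_to_samples is a hypothetical helper, not something the module defines.

import struct

def pcm16_to_samples(audio_data: bytes) -> tuple:
    """Decode little-endian 16-bit PCM into int samples.

    Drops a trailing odd byte instead of raising struct.error, and pins
    the byte order with '<' rather than relying on native order.
    """
    usable = len(audio_data) - (len(audio_data) % 2)
    return struct.unpack(f"<{usable // 2}h", audio_data[:usable])

# Example: two samples plus a stray trailing byte decode without error.
print(pcm16_to_samples(b"\x00\x10\xff\x7f\x01"))  # (4096, 32767)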
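
To rule out the surrounding class when debugging empty responses, the same recognition settings can be exercised directly against the API. A minimal sketch, assuming google-cloud-speech 2.x, application-default credentials, and a placeholder sample.wav containing 16 kHz mono LINEAR16 audio:

from google.cloud import speech

client = speech.SpeechClient()

with open("sample.wav", "rb") as f:
    content = f.read()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="tr-TR",
    audio_channel_count=1,
    enable_automatic_punctuation=True,
)
audio = speech.RecognitionAudio(content=content)

response = client.recognize(config=config, audio=audio)
print(f"billed: {response.total_billed_time.total_seconds()}s")
for result in response.results:
    alt = result.alternatives[0]
    print(f"'{alt.transcript}' (confidence: {alt.confidence:.2f})")

If total_billed_time also comes back as 0s here, the problem likely lies in the captured audio itself rather than in GoogleSTT.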