Spaces:
Building
Building
""" | |
Google Cloud Speech-to-Text Implementation - Simple Batch Mode | |
""" | |
from typing import Optional, List | |
from datetime import datetime | |
import io | |
import wave | |
import struct | |
from google.cloud import speech | |
from google.cloud.speech import RecognitionConfig, RecognitionAudio | |
from utils.logger import log_info, log_error, log_debug, log_warning | |
from .stt_interface import STTInterface, STTConfig, TranscriptionResult | |
class GoogleSTT(STTInterface): | |
def __init__(self, credentials_path: Optional[str] = None):
    """
    Create the Google Cloud Speech client used by this provider.

    Args:
        credentials_path: Path to a service account JSON key file. When
            omitted or empty, the client is constructed without explicit
            credentials (Application Default Credentials).

    Raises:
        Exception: Any error raised while constructing the client is
            logged and then re-raised unchanged.
    """
    use_default_credentials = not credentials_path
    try:
        if use_default_credentials:
            # No key file supplied: let the client library resolve ADC.
            self.client = speech.SpeechClient()
            success_message = "✅ Google STT initialized with default credentials"
        else:
            self.client = speech.SpeechClient.from_service_account_file(credentials_path)
            success_message = f"✅ Google STT initialized with service account: {credentials_path}"
        log_info(success_message)
    except Exception as init_error:
        log_error(f"❌ Failed to initialize Google STT: {str(init_error)}")
        raise
def _map_language_code(self, language: str) -> str: | |
"""Map language codes to Google format""" | |
# Google uses BCP-47 language codes | |
language_map = { | |
"tr": "tr-TR", | |
"tr-TR": "tr-TR", | |
"en": "en-US", | |
"en-US": "en-US", | |
"en-GB": "en-GB", | |
"de": "de-DE", | |
"de-DE": "de-DE", | |
"fr": "fr-FR", | |
"fr-FR": "fr-FR", | |
"es": "es-ES", | |
"es-ES": "es-ES", | |
"it": "it-IT", | |
"it-IT": "it-IT", | |
"pt": "pt-BR", | |
"pt-BR": "pt-BR", | |
"ru": "ru-RU", | |
"ru-RU": "ru-RU", | |
"ja": "ja-JP", | |
"ja-JP": "ja-JP", | |
"ko": "ko-KR", | |
"ko-KR": "ko-KR", | |
"zh": "zh-CN", | |
"zh-CN": "zh-CN", | |
"ar": "ar-SA", | |
"ar-SA": "ar-SA", | |
} | |
# Default to the language itself if not in map | |
return language_map.get(language, language) | |
def _analyze_audio_content(self, audio_data: bytes, sample_rate: int = 16000) -> None:
    """Log diagnostic statistics about a buffer of 16-bit PCM audio.

    The buffer is decoded as little-endian signed 16-bit samples and the
    following is written to the log: zero/non-zero counts, average and peak
    amplitude, a ten-section breakdown, and the position of the first
    sample whose magnitude exceeds the speech threshold.

    This is a best-effort debugging aid: any error is logged and swallowed.

    Args:
        audio_data: Raw PCM bytes. A dangling odd byte at the end is ignored.
        sample_rate: Samples per second, used only to convert the
            speech-start sample index into seconds. Defaults to 16000.
    """
    try:
        if len(audio_data) < 100:
            log_warning(f"⚠️ Very short audio data: {len(audio_data)} bytes")
            return

        # unpack_from tolerates a trailing odd byte, and '<' pins the byte
        # order so the result does not depend on the host CPU.
        total_samples = len(audio_data) // 2
        samples = struct.unpack_from(f'<{total_samples}h', audio_data)

        # Basic stats
        magnitudes = [abs(s) for s in samples if s]
        zero_count = total_samples - len(magnitudes)
        zero_percentage = (zero_count / total_samples) * 100
        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_percentage:.1f}%)")

        if not magnitudes:
            log_warning("⚠️ All samples are zero - no audio content")
            return

        avg_amplitude = sum(magnitudes) / len(magnitudes)
        max_amplitude = max(magnitudes)
        log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

        # Section analysis: ten consecutive slices; the last one also takes
        # the remainder so no tail samples are left out.
        section_count = 10
        section_size = total_samples // section_count
        log_info(f"🔍 Section analysis (each {section_size} samples):")
        for i in range(section_count):
            start = i * section_size
            end = total_samples if i == section_count - 1 else start + section_size
            section_magnitudes = [abs(s) for s in samples[start:end] if s]
            if not section_magnitudes:
                # Fully silent sections are not reported.
                continue
            section_len = end - start
            section_zero_pct = ((section_len - len(section_magnitudes)) / section_len) * 100
            section_max = max(section_magnitudes)
            section_avg = sum(section_magnitudes) / len(section_magnitudes)
            log_info(f"Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={section_zero_pct:.1f}%")

        # Find where speech starts (first significant activity)
        speech_threshold = 1000  # Minimum amplitude to consider as speech
        speech_start = next(
            (i for i, sample in enumerate(samples) if abs(sample) > speech_threshold),
            None,
        )
        if speech_start is not None:
            log_info(f"🎤 Speech detected starting at sample {speech_start} ({speech_start/sample_rate:.2f}s)")
        else:
            log_warning(f"⚠️ No clear speech signal detected (threshold: {speech_threshold})")
    except Exception as e:
        log_error(f"❌ Error analyzing audio: {e}")
def _trim_silence(self, audio_data: bytes) -> bytes: | |
"""Trim silence from beginning and end of audio""" | |
try: | |
if len(audio_data) < 100: | |
return audio_data | |
# Convert to samples | |
samples = list(struct.unpack(f'{len(audio_data)//2}h', audio_data)) | |
# Silence threshold - daha düşük bir threshold kullan | |
silence_threshold = 200 # Daha düşük threshold | |
# Find first non-silent sample | |
start_idx = 0 | |
for i, sample in enumerate(samples): | |
if abs(sample) > silence_threshold: | |
start_idx = i | |
break | |
# Find last non-silent sample | |
end_idx = len(samples) - 1 | |
for i in range(len(samples) - 1, -1, -1): | |
if abs(samples[i]) > silence_threshold: | |
end_idx = i | |
break | |
# Ensure we have some audio | |
if start_idx >= end_idx: | |
log_warning("⚠️ No audio content above silence threshold") | |
return audio_data | |
# Add small padding (250ms = 4000 samples at 16kHz) | |
padding = 2000 # 125ms padding | |
start_idx = max(0, start_idx - padding) | |
end_idx = min(len(samples) - 1, end_idx + padding) | |
# Extract trimmed audio | |
trimmed_samples = samples[start_idx:end_idx + 1] | |
log_info(f"🔧 Silence trimming: {len(samples)} → {len(trimmed_samples)} samples") | |
log_info(f"🔧 Trimmed duration: {len(trimmed_samples)/16000:.2f}s") | |
# Convert back to bytes | |
trimmed_audio = struct.pack(f'{len(trimmed_samples)}h', *trimmed_samples) | |
return trimmed_audio | |
except Exception as e: | |
log_error(f"❌ Silence trimming failed: {e}") | |
return audio_data | |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
    """Run the audio debugging pipeline for a transcription request.

    NOTE: the Google API call is currently skipped for debugging, so this
    method always returns None. It currently does the following:

    1. Saves the raw audio as a WAV file and runs the external
       ``test_single_wav.py`` script on it.
    2. Logs audio statistics and trims leading/trailing silence.
    3. Saves the trimmed audio as a WAV file and runs the test script again.

    The WAV files are left on disk on purpose so they can be inspected.

    Args:
        audio_data: Raw PCM bytes, written to WAV as 16-bit mono.
        config: STT configuration; only ``config.sample_rate`` is read here.

    Returns:
        Always None (no audio, audio too short, an error, or the
        deliberately skipped API call).
    """
    import traceback

    try:
        # Check if we have audio to transcribe
        if not audio_data:
            log_warning("⚠️ No audio data provided")
            return None

        log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

        # Save the raw audio as WAV so it can be tested as-is.
        raw_wav_file = self._save_debug_wav(audio_data, config.sample_rate, "raw_audio")
        log_info(f"🎯 RAW audio saved as WAV: {raw_wav_file}")

        # Run the stand-alone test script on the raw audio.
        # NOTE(review): this call uses a relative script path while the
        # trimmed-audio call below uses /app/...; confirm which is intended.
        try:
            result = await self._run_wav_test('./test_single_wav.py', raw_wav_file)
            log_info(f"🔍 Raw WAV test result: {result.stdout}")
            if result.stderr:
                log_error(f"🔍 Raw WAV test error: {result.stderr}")

            # If the raw audio transcribes, the problem is in our processing.
            if "Transcript:" in result.stdout:
                log_info("✅ RAW audio works! Problem is in our processing.")
            else:
                log_error("❌ Even RAW audio doesn't work - problem in frontend!")
        except Exception as e:
            log_warning(f"Could not run raw audio test: {e}")

        # Audio analysis
        self._analyze_audio_content(audio_data)

        # Silence trimming
        trimmed_audio = self._trim_silence(audio_data)

        # 8000 bytes = 4000 16-bit mono samples (0.25 s at 16 kHz).
        if len(trimmed_audio) < 8000:
            log_warning("⚠️ Audio too short after trimming")
            return None

        # Save the trimmed audio as well.
        trimmed_wav_file = self._save_debug_wav(trimmed_audio, config.sample_rate, "trimmed_audio")
        log_info(f"🎯 TRIMMED audio saved as WAV: {trimmed_wav_file}")

        # Run the test script on the trimmed audio too.
        try:
            result = await self._run_wav_test('/app/test_single_wav.py', trimmed_wav_file)
            log_info(f"🔍 Trimmed WAV test result: {result.stdout}")
            if result.stderr:
                log_error(f"🔍 Trimmed WAV test error: {result.stderr}")
        except Exception as e:
            log_warning(f"Could not run trimmed audio test: {e}")

        # Intentionally do not send anything to Google while debugging.
        log_info("❌ Skipping Google API call for debugging")
        return None

    except Exception as e:
        log_error(f"❌ Error during transcription: {str(e)}")
        log_error(f"Traceback: {traceback.format_exc()}")
        return None

def _save_debug_wav(self, audio_data: bytes, sample_rate: int, prefix: str) -> str:
    """Write PCM bytes to a uniquely named 16-bit mono WAV file and return its path."""
    import os
    import tempfile
    import wave

    stamp = datetime.now().strftime('%H%M%S')
    # mkstemp creates the file atomically under a unique name, so two
    # requests in the same second cannot overwrite each other's dump.
    fd, wav_path = tempfile.mkstemp(prefix=f"{prefix}_{stamp}_", suffix=".wav")
    try:
        with os.fdopen(fd, 'wb') as raw_file, wave.open(raw_file, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
    except Exception:
        # Do not leave a broken or empty file behind.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise
    return wav_path

async def _run_wav_test(self, script_path: str, wav_path: str):
    """Run the external WAV test script in a worker thread and return the CompletedProcess."""
    import asyncio
    import functools
    import subprocess
    import sys

    run_test = functools.partial(
        subprocess.run,
        # Use the interpreter running this service so the script sees the
        # same installed packages; fall back to PATH lookup if unknown.
        [sys.executable or 'python', script_path, wav_path],
        capture_output=True,
        text=True,
        timeout=30,
    )
    # subprocess.run blocks for up to 30 s; keep it off the event loop.
    return await asyncio.get_running_loop().run_in_executor(None, run_test)
def _create_wav_like_test(self, audio_data: bytes, sample_rate: int) -> bytes:
    """Wrap raw PCM audio in a WAV container using the ``wave`` module.

    The WAV is built in memory as mono, 16-bit, at ``sample_rate``. The
    result is then re-opened for reading and its channel count, frame rate
    and frame count are logged as a sanity check.

    Args:
        audio_data: Raw PCM frames to store in the WAV.
        sample_rate: Frame rate written into the WAV header.

    Returns:
        The complete WAV file contents as bytes.

    Raises:
        Exception: If WAV creation fails and no ``_convert_to_wav_proper``
            fallback is available on this object, the original error is
            re-raised after being logged.
    """
    import io
    import wave

    try:
        # Build the file in memory: no temp-file name race and nothing to
        # clean up on disk.
        buffer = io.BytesIO()
        with wave.open(buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)

        wav_data = buffer.getvalue()
        log_info(f"🔧 WAV created using wave module: {len(wav_data)} bytes")

        # Debug: parse the WAV we just produced and log its header fields.
        buffer.seek(0)
        with wave.open(buffer, 'rb') as wav_file:
            log_info(f"🔧 Wave validation: {wav_file.getnchannels()}ch, {wav_file.getframerate()}Hz, {wav_file.getnframes()} frames")

        return wav_data
    except Exception as e:
        log_error(f"❌ Wave module WAV creation failed: {e}")
        # Fall back to the manual converter when one exists.
        # NOTE(review): _convert_to_wav_proper is not defined in the part of
        # the class visible here; confirm it is provided elsewhere.
        fallback = getattr(self, "_convert_to_wav_proper", None)
        if fallback is None:
            # Surface the real failure instead of an AttributeError.
            raise
        return fallback(audio_data, sample_rate)
def get_supported_languages(self) -> List[str]:
    """Return the language tags this provider advertises.

    The list is a partial selection of Google Cloud Speech-to-Text
    languages; a new list is built on every call.
    """
    # One whitespace-separated row per group of six tags, same order as before.
    language_rows = (
        "tr-TR en-US en-GB en-AU en-CA en-IN",
        "es-ES es-MX es-AR fr-FR fr-CA de-DE",
        "it-IT pt-BR pt-PT ru-RU ja-JP ko-KR",
        "zh-CN zh-TW ar-SA ar-EG hi-IN nl-NL",
        "pl-PL sv-SE da-DK no-NO fi-FI el-GR",
        "he-IL th-TH vi-VN id-ID ms-MY fil-PH",
    )
    supported = []
    for row in language_rows:
        supported.extend(row.split())
    return supported
def get_provider_name(self) -> str:
    """Return the short identifier of this STT provider."""
    provider_name = "google"
    return provider_name