Spaces:
Building
Building
""" | |
Google Cloud Speech-to-Text Implementation | |
""" | |
import asyncio
import os
import queue
import sys
import threading
import traceback
from datetime import datetime
from typing import Any, AsyncIterator, List, Optional

from logger import log_info, log_error, log_debug, log_warning

# Import Google Cloud Speech only if available
try:
    from google.cloud import speech
    from google.api_core import exceptions
    GOOGLE_SPEECH_AVAILABLE = True
except ImportError:
    GOOGLE_SPEECH_AVAILABLE = False
    log_info("⚠️ Google Cloud Speech library not installed")

from stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation.

    Audio chunks are handed to a worker thread through ``audio_queue``; the
    worker runs the blocking ``streaming_recognize`` call and pushes
    ``TranscriptionResult`` objects onto ``responses_queue``, which the async
    methods poll without blocking.
    """

    # Seconds after which the request generator ends the stream on its own
    # (4 min 40 s: a safety margin below the 5-minute stream duration limit).
    MAX_STREAM_SECONDS = 280

    def __init__(self, credentials_path: str):
        """Create the Speech client from a credentials file.

        Args:
            credentials_path: Path to the Google credentials file. It is
                exported as ``GOOGLE_APPLICATION_CREDENTIALS`` before the
                client is constructed.

        Raises:
            ImportError: google-cloud-speech is not installed.
            FileNotFoundError: ``credentials_path`` is empty or does not exist.
            Exception: Any error raised while constructing the client is
                logged and re-raised unchanged.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")

        if not (credentials_path and os.path.exists(credentials_path)):
            log_error(f"❌ Google credentials path not found: {credentials_path}")
            raise FileNotFoundError(f"Credentials file not found: {credentials_path}")

        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        log_info(f"✅ Google credentials set from: {credentials_path}")

        # Constructing the client is the only credential check performed here;
        # it is built exactly once (the original built a second, redundant one).
        try:
            log_info("🔐 Testing Google credentials...")
            self.client = speech.SpeechClient()
            log_info("✅ Google credentials valid")
        except Exception as e:
            log_error("❌ Google credentials error", error=str(e))
            raise

        self.streaming_config = None
        self.is_streaming = False
        self.audio_queue = queue.Queue()
        # Plain thread-safe queue (not asyncio.Queue): it is filled from the
        # worker thread and polled with get_nowait() from the async side.
        self.responses_queue = queue.Queue()
        self.stream_thread = None
        self.stop_event = threading.Event()

    async def start_streaming(self, config: dict) -> None:
        """Build the streaming configuration and start the worker thread.

        Args:
            config: Either a dict with the optional keys ``language``,
                ``sample_rate``, ``encoding``, ``enable_punctuation``,
                ``interim_results`` and ``single_utterance``, or an object
                that already exposes those attributes (used as-is).

        Raises:
            Exception: Any failure is logged, ``is_streaming`` is reset to
                ``False`` and the error is re-raised.
        """
        try:
            log_info(f"🎤 Starting Google STT streaming with config: {config}")

            # Never run two workers against the same audio queue: they would
            # split the audio between two separate recognition streams.
            if self.stream_thread is not None and self.stream_thread.is_alive():
                log_warning("⚠️ Google STT stream already running, stopping it before restart")
                self._halt_stream_thread()

            # Convert dict to STTConfig if needed
            if isinstance(config, dict):
                stt_config = STTConfig(
                    language=config.get("language", "tr-TR"),
                    sample_rate=config.get("sample_rate", 16000),
                    encoding=config.get("encoding", "WEBM_OPUS"),
                    enable_punctuation=config.get("enable_punctuation", True),
                    interim_results=config.get("interim_results", True),
                    single_utterance=config.get("single_utterance", False),
                )
            else:
                stt_config = config

            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(stt_config.encoding),
                sample_rate_hertz=stt_config.sample_rate,
                language_code=stt_config.language,
                enable_automatic_punctuation=stt_config.enable_punctuation,
                model="latest_long",
                use_enhanced=True,
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=stt_config.interim_results,
                single_utterance=stt_config.single_utterance,
            )

            self.is_streaming = True
            self.stop_event.clear()

            # Start streaming thread
            self.stream_thread = threading.Thread(target=self._run_stream)
            self.stream_thread.start()
            log_info("✅ Google STT streaming started successfully")
        except Exception as e:
            log_error("❌ Failed to start Google STT streaming", error=str(e))
            self.is_streaming = False
            raise

    def _put_result(self, result: TranscriptionResult):
        """Queue a transcription result for the async side; log on failure."""
        try:
            self.responses_queue.put(result)
        except Exception as e:
            log_error(f"❌ Error queuing result: {e}")

    @staticmethod
    def _drain_queue(q) -> list:
        """Remove and return every item currently in ``q`` without blocking."""
        items = []
        while True:
            try:
                items.append(q.get_nowait())
            except queue.Empty:
                # EAFP instead of empty()+get_nowait(), which can race with
                # a worker thread that is still consuming.
                return items

    def _halt_stream_thread(self) -> None:
        """Signal the worker to stop, wait up to 5 s for it, drop queued audio.

        NOTE: ``join`` is a blocking call; when invoked from a coroutine it
        holds the event loop for at most the 5-second timeout.
        """
        self.stop_event.set()
        # Poison pill wakes the request generator if it is waiting for audio.
        self.audio_queue.put(None)
        if self.stream_thread is not None:
            self.stream_thread.join(timeout=5.0)
        # Also removes an unconsumed poison pill so it cannot end a later stream.
        self._drain_queue(self.audio_queue)

    def _request_generator(self):
        """Yield ``StreamingRecognizeRequest`` objects built from queued audio.

        Ends when the stop event is set, a ``None`` poison pill is dequeued,
        ``MAX_STREAM_SECONDS`` have elapsed, or an unexpected error occurs.
        """
        chunk_count = 0
        start_time = datetime.now()
        while not self.stop_event.is_set():
            try:
                # End the stream ourselves when approaching the 5-minute limit.
                elapsed = (datetime.now() - start_time).total_seconds()
                if elapsed > self.MAX_STREAM_SECONDS:
                    log_warning(f"⚠️ Approaching 5-minute limit ({elapsed:.1f}s), ending stream gracefully")
                    break

                # Short timeout so the stop event is re-checked regularly.
                chunk = self.audio_queue.get(timeout=0.1)
                if chunk is None:  # Poison pill
                    log_info("📛 Poison pill received, stopping request generator")
                    break

                chunk_count += 1
                # Log only at milestones to keep the log readable.
                if chunk_count == 1:
                    log_info(f"📤 First chunk sent to Google STT, size: {len(chunk)} bytes")
                elif chunk_count % 100 == 0:
                    log_info(f"📤 Sent {chunk_count} chunks to Google STT (elapsed: {elapsed:.1f}s)")

                yield speech.StreamingRecognizeRequest(audio_content=chunk)
            except queue.Empty:
                continue
            except Exception as e:
                log_error(f"❌ Error in request generator: {e}")
                break

    def _log_stream_error(self, error_msg: str) -> None:
        """Log a streaming failure with a hint chosen from the error text."""
        if "Exceeded maximum allowed stream duration" in error_msg:
            # Expected for long sessions, so it is a warning rather than an
            # error and nothing is pushed onto the result queue.
            log_warning("⚠️ Stream duration limit exceeded (5 minutes). This is expected for long sessions.")
        elif "Bad language code" in error_msg:
            log_error("❌ Invalid language code in STT config. Check locale settings.")
        elif "invalid_argument" in error_msg:
            log_error("❌ Invalid STT configuration. Check encoding and sample rate.")
        elif "Deadline Exceeded" in error_msg:
            log_error("❌ Google STT timeout - possibly network issue or slow connection")
        else:
            log_error(f"❌ Google STT stream error: {error_msg}")

    def _run_stream(self):
        """Worker-thread entry point: run streaming recognition until it ends.

        Every non-blank transcript is queued as a ``TranscriptionResult``;
        only final results are logged. Errors are logged, never raised, so
        the thread always terminates cleanly.
        """
        try:
            log_info("🎤 Google STT stream thread started")
            requests = self._request_generator()
            log_info("🎤 Creating Google STT streaming client...")
            try:
                responses = self.client.streaming_recognize(self.streaming_config, requests)
                log_info("✅ Google STT streaming client created")

                response_count = 0
                empty_response_count = 0
                for response in responses:
                    response_count += 1
                    if self.stop_event.is_set():
                        log_info("🛑 Stop event detected, breaking response loop")
                        break

                    # Count empty responses but only report every 50th one.
                    if not response.results:
                        empty_response_count += 1
                        if empty_response_count % 50 == 0:
                            log_warning(f"⚠️ Received {empty_response_count} empty responses from Google STT")
                        continue

                    for result in response.results:
                        if not result.alternatives:
                            continue
                        # Only the first alternative is used.
                        alternative = result.alternatives[0]
                        # Skip transcripts that are empty or whitespace only.
                        if not alternative.transcript.strip():
                            continue

                        transcription = TranscriptionResult(
                            text=alternative.transcript,
                            is_final=result.is_final,
                            # A missing or zero confidence is reported as 0.0.
                            confidence=getattr(alternative, 'confidence', 0.0) or 0.0,
                            timestamp=datetime.now().timestamp(),
                        )
                        self._put_result(transcription)

                        if result.is_final:
                            log_info(f"🎯 GOOGLE STT FINAL: '{alternative.transcript}'")

                log_info(f"📊 Google STT stream ended. Total responses: {response_count}, Empty: {empty_response_count}")
            except Exception as e:
                self._log_stream_error(str(e))
        except Exception as e:
            log_error("❌ Fatal error in STT stream thread", error=str(e), traceback=traceback.format_exc())
        finally:
            log_info("🎤 Google STT stream thread ended")

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Queue one audio chunk and yield every result that is already available.

        Args:
            audio_chunk: Raw audio bytes for the worker thread. ``None`` is
                reserved as the stop sentinel and must not be passed here.

        Yields:
            Results currently waiting in ``responses_queue``; never blocks
            waiting for new ones.

        Raises:
            RuntimeError: ``start_streaming()`` has not been called.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")
        try:
            # Put audio in queue for streaming thread
            self.audio_queue.put(audio_chunk)
            while True:
                try:
                    result = self.responses_queue.get_nowait()
                except queue.Empty:
                    # No more results in queue
                    break
                yield result
        except Exception as e:
            log_error("❌ Google STT streaming error", error=str(e))
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop the worker thread and return the last final result, if any.

        Returns:
            The most recent queued result whose ``is_final`` is true, or
            ``None`` when streaming was not active, no final result was
            pending, or stopping failed (the failure is logged).
        """
        if not self.is_streaming:
            return None
        try:
            log_info("🛑 Stopping Google STT streaming...")
            self.is_streaming = False
            self._halt_stream_thread()

            # responses_queue is a plain queue.Queue, so it is drained
            # synchronously; awaiting its get() raised TypeError and lost the
            # final transcript.
            final_result = None
            for result in self._drain_queue(self.responses_queue):
                if result.is_final:
                    final_result = result

            log_info("✅ Google STT streaming stopped")
            return final_result
        except Exception as e:
            log_error("❌ Failed to stop Google STT streaming", error=str(e))
            return None

    def supports_realtime(self) -> bool:
        """Google Cloud STT supports real-time streaming."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider advertises."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier."""
        return "google"

    def _get_encoding(self, encoding_str: str):
        """Convert an encoding name to the Google Speech ``AudioEncoding`` enum.

        Args:
            encoding_str: One of ``WEBM_OPUS``, ``LINEAR16``, ``FLAC``,
                ``MP3`` or ``OGG_OPUS`` (matched case-insensitively).

        Returns:
            The matching enum member, ``WEBM_OPUS`` for any unrecognised
            value (a warning is logged), or ``None`` when the Google library
            is not installed.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None

        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MP3": audio_encoding.MP3,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
        }
        # Only strings are normalised; anything else takes the fallback path.
        key = encoding_str.upper() if isinstance(encoding_str, str) else encoding_str
        if key not in encoding_map:
            log_warning(f"⚠️ Unknown STT encoding '{encoding_str}', falling back to WEBM_OPUS")
            return audio_encoding.WEBM_OPUS
        return encoding_map[key]