Spaces:

UcsTurkey
/

flare

Building

File size: 5,390 Bytes

"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from datetime import datetime
import sys
from logger import log_info, log_error, log_debug, log_warning

# Import Google Cloud Speech only if available
try:
    from google.cloud import speech_v1p1beta1 as speech
    from google.api_core import exceptions
    GOOGLE_SPEECH_AVAILABLE = True
except ImportError:
    GOOGLE_SPEECH_AVAILABLE = False
    log_info("⚠️ Google Cloud Speech library not installed")

from stt_interface import STTInterface, STTConfig, TranscriptionResult

class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation.

    Builds the streaming recognition configuration and tracks session state.

    NOTE: ``stream_audio`` is still a placeholder. It buffers incoming audio in
    ``audio_queue`` but does not send it to the Google streaming API, so no
    transcription results are produced yet.
    """

    def __init__(self, credentials_path: str):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a Google credentials file. When it is
                empty or does not exist, ``GOOGLE_APPLICATION_CREDENTIALS`` is
                left untouched and the client is created with whatever
                credentials the environment already provides.

        Raises:
            ImportError: If the google-cloud-speech library is not installed.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")

        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log_info(f"✅ Google credentials set from: {credentials_path}")
        else:
            # This is a degraded-configuration condition, so report it at
            # warning level instead of info.
            log_warning("⚠️ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        self.streaming_config = None
        self.is_streaming = False
        self.audio_queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Initialize a streaming session from ``config``.

        Builds the recognition and streaming configuration, discards any audio
        left over from a previous session and marks the instance as streaming.

        Args:
            config: Provider-independent STT settings (encoding, sample rate,
                language, punctuation, word timestamps, model, enhanced flag,
                interim results, single utterance).

        Raises:
            Exception: Any error raised while building the configuration is
                logged and re-raised unchanged.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced
            )

            streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance
            )

            # Chunks buffered by an earlier session must not leak into this one.
            self._drain_audio_queue()

            # Publish the new state only after everything above succeeded.
            self.streaming_config = streaming_config
            self.is_streaming = True
            log_info("✅ Google STT streaming started")

        except Exception as e:
            log_error("❌ Failed to start Google STT streaming", e)
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Buffer one audio chunk; currently yields no results.

        This is an async generator, so the "not started" check runs when the
        caller first iterates it, not when the method is called.

        Args:
            audio_chunk: Raw audio bytes in the encoding configured at start.

        Yields:
            Nothing yet. The Google streaming call is not implemented.

        Raises:
            RuntimeError: If ``start_streaming`` has not been called.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")

        try:
            # Keep the chunk so that a future streaming implementation can
            # consume it; the queue is emptied on start/stop.
            await self.audio_queue.put(audio_chunk)

            # TODO: feed the queue to self.client.streaming_recognize() and
            # yield TranscriptionResult objects built from its responses.
            return
            yield  # Unreachable; its presence makes this an async generator

        except Exception as e:
            log_error("❌ Google STT streaming error", e)
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop the streaming session.

        Clears the streaming flag and drops any audio that is still buffered.

        Returns:
            Always ``None``; there is no final result because no audio is
            transcribed yet. Also ``None`` when no session is active.
        """
        if not self.is_streaming:
            return None

        try:
            self.is_streaming = False
            # Release buffered audio so it is neither kept in memory nor
            # replayed into the next session.
            self._drain_audio_queue()
            log_info("✅ Google STT streaming stopped")

            # Return final result if any
            return None

        except Exception as e:
            log_error("❌ Failed to stop Google STT streaming", e)
            raise

    def supports_realtime(self) -> bool:
        """Report real-time streaming support (always ``True``)."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider advertises."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier, ``"google"``."""
        return "google"

    def _drain_audio_queue(self) -> None:
        """Discard every chunk currently buffered in ``audio_queue``."""
        while True:
            try:
                self.audio_queue.get_nowait()
            except asyncio.QueueEmpty:
                return

    def _get_encoding(self, encoding_str: str):
        """Convert an encoding name to the Google Speech ``AudioEncoding`` enum.

        The lookup ignores case and surrounding whitespace. Unknown, empty or
        non-string values fall back to WEBM_OPUS and a warning is logged so
        the misconfiguration is visible.

        Args:
            encoding_str: Encoding name such as ``"LINEAR16"`` or ``"webm_opus"``.

        Returns:
            The matching ``AudioEncoding`` member, or ``None`` when the Google
            Speech library is not installed.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None

        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MP3": audio_encoding.MP3,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
        }

        if isinstance(encoding_str, str):
            matched = encoding_map.get(encoding_str.strip().upper())
            if matched is not None:
                return matched

        log_warning(f"⚠️ Unknown audio encoding {encoding_str!r}, falling back to WEBM_OPUS")
        return audio_encoding.WEBM_OPUS