"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from google.cloud import speech_v1p1beta1 as speech
from google.api_core import exceptions
from utils import log
from stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleCloudSTT(STTInterface):
"""Google Cloud Speech-to-Text implementation"""
def __init__(self, credentials_path: str):
if credentials_path and os.path.exists(credentials_path):
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
log(f"β
Google credentials set from: {credentials_path}")
else:
log("β οΈ Google credentials path not found, using default credentials")
self.client = speech.SpeechAsyncClient()
self.streaming_config = None
self.is_streaming = False
self.audio_queue = asyncio.Queue()
    async def start_streaming(self, config: STTConfig) -> None:
        """Initialize streaming session"""
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                    audio_topic="general"
                )
            )

            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance
            )

            self.is_streaming = True
            log("✅ Google STT streaming session started")
        except Exception as e:
            log(f"❌ Failed to start Google STT streaming: {e}")
            raise
    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Stream audio chunk and get transcription results"""
        if not self.is_streaming:
            log("⚠️ STT streaming not started")
            return

        try:
            # Add audio chunk to queue
            await self.audio_queue.put(audio_chunk)

            # Process audio stream
            async def audio_generator():
                while self.is_streaming:
                    chunk = await self.audio_queue.get()
                    yield speech.StreamingRecognizeRequest(audio_content=chunk)

            # Get responses
            responses = await self.client.streaming_recognize(
                self.streaming_config,
                audio_generator()
            )

            async for response in responses:
                for result in response.results:
                    if result.alternatives:
                        yield TranscriptionResult(
                            text=result.alternatives[0].transcript,
                            is_final=result.is_final,
                            confidence=result.alternatives[0].confidence,
                            timestamp=asyncio.get_event_loop().time()
                        )
        except exceptions.OutOfRange:
            log("⚠️ Google STT: Exceeded maximum audio duration")
            self.is_streaming = False
        except Exception as e:
            log(f"❌ Google STT streaming error: {e}")
            raise
    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and get final result"""
        self.is_streaming = False
        log("🛑 Google STT streaming stopped")

        # Process any remaining audio in queue
        if not self.audio_queue.empty():
            # TODO: Process remaining audio
            pass

        return None
    def supports_realtime(self) -> bool:
        """Google Cloud Speech supports real-time streaming"""
        return True

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
        ]
    def _get_encoding(self, encoding: str):
        """Convert encoding string to Google Cloud Speech encoding"""
        encoding_map = {
            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
            "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
            "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
            "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
            "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
        }
        return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
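

# --- Usage sketch (illustrative only) -------------------------------------------
# A minimal, hedged example of driving GoogleCloudSTT end to end. It assumes
# STTConfig accepts the keyword arguments read by start_streaming above
# (encoding, sample_rate, language, enable_punctuation, enable_word_timestamps,
# model, use_enhanced, interim_results, single_utterance); check stt_interface
# for the real signature. The credentials path, audio file, and chunk size are
# placeholders, not values from this project.
if __name__ == "__main__":

    async def _demo() -> None:
        stt = GoogleCloudSTT(credentials_path="credentials.json")  # placeholder path
        config = STTConfig(                                        # assumed field names
            encoding="WEBM_OPUS",
            sample_rate=16000,
            language="tr-TR",
            enable_punctuation=True,
            enable_word_timestamps=False,
            model="default",
            use_enhanced=False,
            interim_results=True,
            single_utterance=False,
        )
        await stt.start_streaming(config)

        # Feed the recognizer from a local file in small chunks (placeholder input)
        with open("sample_audio.webm", "rb") as f:
            while chunk := f.read(4096):
                async for result in stt.stream_audio(chunk):
                    marker = "FINAL" if result.is_final else "interim"
                    log(f"{marker}: {result.text} ({result.confidence:.2f})")

        await stt.stop_streaming()

    asyncio.run(_demo())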