""" Google Cloud Speech-to-Text Implementation """ import os import asyncio from typing import AsyncIterator, Optional, List from google.cloud import speech_v1p1beta1 as speech from google.api_core import exceptions from utils import log from stt_interface import STTInterface, STTConfig, TranscriptionResult class GoogleCloudSTT(STTInterface): """Google Cloud Speech-to-Text implementation""" def __init__(self, credentials_path: str): if credentials_path and os.path.exists(credentials_path): os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path log(f"✅ Google credentials set from: {credentials_path}") else: log("⚠️ Google credentials path not found, using default credentials") self.client = speech.SpeechAsyncClient() self.streaming_config = None self.is_streaming = False self.audio_queue = asyncio.Queue() async def start_streaming(self, config: STTConfig) -> None: """Initialize streaming session""" try: recognition_config = speech.RecognitionConfig( encoding=self._get_encoding(config.encoding), sample_rate_hertz=config.sample_rate, language_code=config.language, enable_automatic_punctuation=config.enable_punctuation, enable_word_time_offsets=config.enable_word_timestamps, model=config.model, use_enhanced=config.use_enhanced, metadata=speech.RecognitionMetadata( interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH, recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC, audio_topic="general" ) ) self.streaming_config = speech.StreamingRecognitionConfig( config=recognition_config, interim_results=config.interim_results, single_utterance=config.single_utterance ) self.is_streaming = True log("✅ Google STT streaming session started") except Exception as e: log(f"❌ Failed to start Google STT streaming: {e}") raise async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]: """Stream audio chunk and get transcription results""" if not self.is_streaming: log("⚠️ STT streaming not started") return try: # Add audio chunk to queue await self.audio_queue.put(audio_chunk) # Process audio stream async def audio_generator(): while self.is_streaming: chunk = await self.audio_queue.get() yield speech.StreamingRecognizeRequest(audio_content=chunk) # Get responses responses = await self.client.streaming_recognize( self.streaming_config, audio_generator() ) async for response in responses: for result in response.results: if result.alternatives: yield TranscriptionResult( text=result.alternatives[0].transcript, is_final=result.is_final, confidence=result.alternatives[0].confidence, timestamp=asyncio.get_event_loop().time() ) except exceptions.OutOfRange: log("⚠️ Google STT: Exceeded maximum audio duration") self.is_streaming = False except Exception as e: log(f"❌ Google STT streaming error: {e}") raise async def stop_streaming(self) -> Optional[TranscriptionResult]: """Stop streaming and get final result""" self.is_streaming = False log("🛑 Google STT streaming stopped") # Process any remaining audio in queue if not self.audio_queue.empty(): # TODO: Process remaining audio pass return None def supports_realtime(self) -> bool: """Google Cloud Speech supports real-time streaming""" return True def get_supported_languages(self) -> List[str]: """Get list of supported language codes""" return [ "tr-TR", # Turkish "en-US", # English (US) "en-GB", # English (UK) "de-DE", # German "fr-FR", # French "es-ES", # Spanish "it-IT", # Italian "pt-BR", # Portuguese (Brazil) "ru-RU", # Russian "ja-JP", # Japanese "ko-KR", # Korean "zh-CN", # Chinese (Simplified) ] def _get_encoding(self, encoding: str): """Convert encoding string to Google Cloud Speech encoding""" encoding_map = { "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16, "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC, "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW, "AMR": speech.RecognitionConfig.AudioEncoding.AMR, "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB, "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS, "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE, "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS, } return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)