# flare/stt_google.py
# NOTE: removed non-code artifacts scraped from the hosting page
# (author, commit hash, file size) — they were not valid Python.
"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from datetime import datetime
import sys
from logger import log_info, log_error, log_debug, log_warning
# Import Google Cloud Speech only if available
try:
from google.cloud import speech_v1p1beta1 as speech
from google.api_core import exceptions
GOOGLE_SPEECH_AVAILABLE = True
except ImportError:
GOOGLE_SPEECH_AVAILABLE = False
log_info("⚠️ Google Cloud Speech library not installed")
from stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation.

    NOTE(review): streaming recognition is currently a stub — audio chunks are
    queued but never sent through Google's bidirectional streaming API, and no
    transcription results are produced yet.
    """

    def __init__(self, credentials_path: str):
        """Initialize the async Speech client.

        Args:
            credentials_path: Path to a service-account JSON key file. If the
                path is empty or does not exist, Google Application Default
                Credentials are used instead.

        Raises:
            ImportError: If the google-cloud-speech package is not installed.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")
        if credentials_path and os.path.exists(credentials_path):
            # The client library picks credentials up from this env var.
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log_info(f"βœ… Google credentials set from: {credentials_path}")
        else:
            # Fix: this is a warning condition — use log_warning (imported but
            # previously unused) instead of log_info for consistency.
            log_warning("⚠️ Google credentials path not found, using default credentials")
        self.client = speech.SpeechAsyncClient()
        # Built lazily in start_streaming(); None until then.
        self.streaming_config: Optional["speech.StreamingRecognitionConfig"] = None
        self.is_streaming: bool = False
        # Buffer of raw audio chunks awaiting (future) streaming recognition.
        self.audio_queue: asyncio.Queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Build the streaming recognition config and mark the session active.

        Args:
            config: Provider-agnostic STT configuration to translate into
                Google's RecognitionConfig / StreamingRecognitionConfig.

        Raises:
            Exception: Re-raises any error from config construction after logging.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance
            )
            self.is_streaming = True
            log_info("βœ… Google STT streaming started")
        except Exception as e:
            log_error("❌ Failed to start Google STT streaming", e)
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Accept one audio chunk; yield transcription results as they arrive.

        Stub behavior: the chunk is only enqueued — nothing is sent to Google
        and this async generator terminates without yielding any result.

        Args:
            audio_chunk: Raw audio bytes in the encoding given at start_streaming().

        Raises:
            RuntimeError: If called before start_streaming().
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")
        try:
            await self.audio_queue.put(audio_chunk)
            # Fix: removed a StreamingRecognizeRequest that was constructed here
            # but never used (dead code). Real implementation will need a proper
            # bidirectional streaming_recognize() call consuming the queue.
        except Exception as e:
            log_error("❌ Google STT streaming error", e)
            raise
        # End the generator without yielding; the unreachable `yield` below is
        # deliberate — it makes this function an async generator as the
        # AsyncIterator return type requires.
        return
        yield  # pragma: no cover

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop the streaming session.

        Returns:
            The final transcription result if one is available; currently
            always None (stub — no final result is tracked yet).
        """
        if not self.is_streaming:
            return None
        try:
            self.is_streaming = False
            log_info("βœ… Google STT streaming stopped")
            return None
        except Exception as e:
            log_error("❌ Failed to stop Google STT streaming", e)
            raise

    def supports_realtime(self) -> bool:
        """Google Cloud STT supports real-time streaming."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the list of supported BCP-47 language codes."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Return the canonical provider identifier."""
        return "google"

    def _get_encoding(self, encoding_str: str):
        """Map an encoding name to Google's AudioEncoding enum.

        Args:
            encoding_str: One of "WEBM_OPUS", "LINEAR16", "FLAC", "MP3",
                "OGG_OPUS". Unknown values fall back to WEBM_OPUS.

        Returns:
            The matching speech.RecognitionConfig.AudioEncoding member, or
            None when the Google library is unavailable.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None
        encoding_map = {
            "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
            "MP3": speech.RecognitionConfig.AudioEncoding.MP3,
            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
        }
        return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)