Spaces:
Building
Building
""" | |
Google Cloud Speech-to-Text Implementation | |
""" | |
import os | |
import asyncio | |
from typing import AsyncIterator, Optional, List | |
from datetime import datetime | |
import sys | |
from logger import log_info, log_error, log_debug, log_warning | |
# Import Google Cloud Speech only if available | |
try: | |
from google.cloud import speech_v1p1beta1 as speech | |
from google.api_core import exceptions | |
GOOGLE_SPEECH_AVAILABLE = True | |
except ImportError: | |
GOOGLE_SPEECH_AVAILABLE = False | |
log_info("β οΈ Google Cloud Speech library not installed") | |
from stt_interface import STTInterface, STTConfig, TranscriptionResult | |
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation of ``STTInterface``.

    The streaming path is currently a stub: ``start_streaming`` builds the
    recognition configuration and ``stream_audio`` buffers incoming audio in
    ``audio_queue``, but no request is sent to Google and no transcription
    results are produced yet.
    """

    def __init__(self, credentials_path: str):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a service-account JSON file. When the
                path is non-empty and exists it is exported through the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable;
                otherwise the client is created with whatever default
                credentials the environment provides.

        Raises:
            ImportError: If the ``google-cloud-speech`` package could not be
                imported when this module was loaded.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")

        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log_info(f"β Google credentials set from: {credentials_path}")
        else:
            log_info("β οΈ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        # Populated by start_streaming(); None until a session is configured.
        self.streaming_config = None
        self.is_streaming = False
        # Buffer of raw audio chunks received during the current session.
        self.audio_queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Build the streaming configuration and mark the session as active.

        Any audio still buffered from an earlier session is discarded so it
        cannot leak into the new one.

        Args:
            config: Provider-independent STT settings that are mapped onto
                Google's ``RecognitionConfig`` / ``StreamingRecognitionConfig``.

        Raises:
            Exception: Whatever the Google config constructors raise; the
                error is logged and re-raised unchanged.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance,
            )
            # Start every session with an empty buffer.
            self._drain_audio_queue()
            self.is_streaming = True
            log_info("β Google STT streaming started")
        except Exception as e:
            log_error("β Failed to start Google STT streaming", e)
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Buffer one audio chunk; currently yields no transcription results.

        This is an async generator, so nothing runs (including the
        "not started" check and the buffering) until the caller iterates it.

        Args:
            audio_chunk: Raw audio bytes for the active session.

        Raises:
            RuntimeError: If ``start_streaming()`` has not been called.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")

        try:
            await self.audio_queue.put(audio_chunk)
            # TODO: feed the buffered audio to the Google streaming API
            # (first request carries self.streaming_config, later requests
            # carry audio_content) and yield TranscriptionResult objects.
            # Until then no per-chunk request object is built, since nothing
            # would consume it.
            return
            yield  # unreachable; its presence makes this an async generator
        except Exception as e:
            log_error("β Google STT streaming error", e)
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """End the active session and release buffered audio.

        Returns:
            Always ``None`` for now: no final result is produced because no
            recognition is performed yet. Also ``None`` when no session was
            active.
        """
        if not self.is_streaming:
            return None

        try:
            self.is_streaming = False
            # Nothing consumes the buffer yet, so drop it here; otherwise it
            # would keep growing for the lifetime of the instance.
            self._drain_audio_queue()
            log_info("β Google STT streaming stopped")
            return None
        except Exception as e:
            log_error("β Failed to stop Google STT streaming", e)
            raise

    def supports_realtime(self) -> bool:
        """Report that this provider is meant for real-time streaming."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider advertises."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier, ``"google"``."""
        return "google"

    def _drain_audio_queue(self) -> int:
        """Discard every chunk currently buffered and return how many were dropped."""
        dropped = 0
        while True:
            try:
                self.audio_queue.get_nowait()
            except asyncio.QueueEmpty:
                return dropped
            dropped += 1

    def _get_encoding(self, encoding_str: str):
        """Convert an encoding name to the Google Speech ``AudioEncoding`` enum.

        The lookup ignores case and surrounding whitespace. Unknown names
        fall back to ``WEBM_OPUS`` and a warning is logged so a
        misconfigured encoding does not go unnoticed.

        Args:
            encoding_str: Encoding name such as ``"LINEAR16"`` or ``"flac"``.

        Returns:
            The matching ``AudioEncoding`` member, ``WEBM_OPUS`` for unknown
            names, or ``None`` when the Google Speech library is unavailable.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None

        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MP3": audio_encoding.MP3,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
        }

        # Normalise only real strings; anything else (e.g. None) is looked up
        # as-is and ends up on the fallback path.
        if isinstance(encoding_str, str):
            key = encoding_str.strip().upper()
        else:
            key = encoding_str

        if key in encoding_map:
            return encoding_map[key]

        log_warning(f"Unsupported audio encoding {encoding_str!r}, falling back to WEBM_OPUS")
        return audio_encoding.WEBM_OPUS