# Extracted from a Hugging Face Spaces file view (Space status: Building;
# file size: 5,438 bytes; commits shown in the gutter: b0a4866, c51c470).
"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from datetime import datetime
import sys
def log(message: str) -> None:
    """Print ``message`` to stdout prefixed with a millisecond timestamp.

    The emitted line has the form ``[HH:MM:SS.mmm] message``. Stdout is
    flushed on every call so the line is not held back by output buffering.

    Args:
        message: Text to print after the timestamp.
    """
    # %f yields six microsecond digits; dropping the last three leaves
    # millisecond precision.
    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    print(f"[{timestamp}] {message}", flush=True)
# Import Google Cloud Speech only if available. GOOGLE_SPEECH_AVAILABLE records
# whether the optional dependency could be imported so later code can check it
# before touching the `speech` module.
try:
    from google.cloud import speech_v1p1beta1 as speech
    from google.api_core import exceptions
    GOOGLE_SPEECH_AVAILABLE = True
except ImportError:
    GOOGLE_SPEECH_AVAILABLE = False
    log("⚠️ Google Cloud Speech library not installed")
from stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation.

    Builds a streaming recognition configuration from an ``STTConfig`` and
    tracks the streaming state. The actual exchange with Google's streaming
    API is not implemented yet: ``stream_audio`` only buffers the incoming
    chunk and yields no results.
    """

    def __init__(self, credentials_path: str):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a credentials file. When it is non-empty
                and exists on disk it is exported through the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable;
                otherwise the client is created with whatever default
                credentials the library resolves.

        Raises:
            ImportError: If the google-cloud-speech library is not installed.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError(
                "google-cloud-speech library not installed. "
                "Run: pip install google-cloud-speech"
            )

        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log(f"✅ Google credentials set from: {credentials_path}")
        else:
            log("⚠️ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        self.streaming_config = None
        self.is_streaming = False
        # Buffer of raw audio chunks received via stream_audio().
        self.audio_queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Initialize streaming session.

        Builds the recognition and streaming configuration from ``config``
        and marks the instance as streaming.

        Args:
            config: Settings read here: ``encoding``, ``sample_rate``,
                ``language``, ``enable_punctuation``,
                ``enable_word_timestamps``, ``model``, ``use_enhanced``,
                ``interim_results`` and ``single_utterance``.

        Raises:
            Exception: Any error raised while building the configuration is
                logged and re-raised unchanged.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
            )

            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance,
            )

            self.is_streaming = True
            log("✅ Google STT streaming started")
        except Exception as e:
            log(f"❌ Failed to start Google STT streaming: {e}")
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Stream audio chunk and get transcription results.

        This is an async generator, so the checks below run when the caller
        first iterates it, not when the method is called.

        Args:
            audio_chunk: Raw audio bytes to buffer.

        Yields:
            Nothing at present; see the note in the body.

        Raises:
            RuntimeError: If ``start_streaming()`` has not been called.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")

        try:
            # Add audio to queue
            await self.audio_queue.put(audio_chunk)

            # NOTE: simplified placeholder. A real implementation has to feed
            # the buffered chunks to Google's streaming API and yield its
            # responses. For now this is an empty async iterator.
            return
            yield  # Unreachable; its presence makes this an async generator.
        except Exception as e:
            log(f"❌ Google STT streaming error: {e}")
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and get final result.

        Returns:
            Always ``None``: no final result is produced yet. When streaming
            was never started the call is a no-op.

        Raises:
            Exception: Any error raised while stopping is logged and
                re-raised unchanged.
        """
        if not self.is_streaming:
            return None

        try:
            self.is_streaming = False
            # Nothing consumes the buffered chunks, so release them here
            # instead of keeping them for the lifetime of the object.
            self._drain_audio_queue()
            log("✅ Google STT streaming stopped")

            # Return final result if any
            return None
        except Exception as e:
            log(f"❌ Failed to stop Google STT streaming: {e}")
            raise

    def supports_realtime(self) -> bool:
        """Google Cloud STT supports real-time streaming"""
        return True

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "google"

    def _drain_audio_queue(self) -> None:
        """Discard every audio chunk currently buffered in ``audio_queue``."""
        while True:
            try:
                self.audio_queue.get_nowait()
            except asyncio.QueueEmpty:
                return

    def _get_encoding(self, encoding_str: str):
        """Convert encoding string to Google Speech enum.

        Args:
            encoding_str: One of ``"WEBM_OPUS"``, ``"LINEAR16"``, ``"FLAC"``,
                ``"MP3"`` or ``"OGG_OPUS"``.

        Returns:
            The matching ``AudioEncoding`` member, ``WEBM_OPUS`` for any other
            value, or ``None`` when the Google Speech library is unavailable.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None

        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MP3": audio_encoding.MP3,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
        }

        if encoding_str not in encoding_map:
            # Keep the historical fallback, but make it visible in the log.
            log(f"⚠️ Unknown audio encoding {encoding_str!r}, falling back to WEBM_OPUS")
        return encoding_map.get(encoding_str, audio_encoding.WEBM_OPUS)