"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from google.cloud import speech_v1p1beta1 as speech
from google.api_core import exceptions
from utils import log
from stt_interface import STTInterface, STTConfig, TranscriptionResult
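
# NOTE: STTConfig and TranscriptionResult are defined in stt_interface (not
# shown here). A minimal sketch of the shape this module assumes, for
# reference only -- the authoritative definitions live in stt_interface:
#
#     @dataclass
#     class STTConfig:
#         language: str                 # BCP-47 code, e.g. "tr-TR"
#         sample_rate: int              # e.g. 16000
#         encoding: str                 # key into _get_encoding's map, e.g. "WEBM_OPUS"
#         enable_punctuation: bool
#         enable_word_timestamps: bool
#         model: str                    # e.g. "default"
#         use_enhanced: bool
#         interim_results: bool
#         single_utterance: bool
#
#     @dataclass
#     class TranscriptionResult:
#         text: str
#         is_final: bool
#         confidence: float
#         timestamp: float
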
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation"""

    def __init__(self, credentials_path: str):
        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log(f"βœ… Google credentials set from: {credentials_path}")
        else:
            log("⚠️ Google credentials path not found, using default credentials")
        self.client = speech.SpeechAsyncClient()
        self.streaming_config = None
        self.is_streaming = False
        self.audio_queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Initialize streaming session"""
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                    audio_topic="general"
                )
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance
            )
            self.is_streaming = True
            log("βœ… Google STT streaming session started")
        except Exception as e:
            log(f"❌ Failed to start Google STT streaming: {e}")
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Stream audio chunk and get transcription results"""
        if not self.is_streaming:
            log("⚠️ STT streaming not started")
            return

        try:
            # Add audio chunk to the shared queue
            await self.audio_queue.put(audio_chunk)

            # The gRPC streaming API expects the config in the first request;
            # every subsequent request carries only audio bytes.
            async def request_generator():
                yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)
                while self.is_streaming:
                    chunk = await self.audio_queue.get()
                    yield speech.StreamingRecognizeRequest(audio_content=chunk)

            # Note: each call opens a fresh recognize stream; results for
            # chunks queued by earlier calls are not shared across streams.
            responses = await self.client.streaming_recognize(requests=request_generator())
            async for response in responses:
                for result in response.results:
                    if result.alternatives:
                        yield TranscriptionResult(
                            text=result.alternatives[0].transcript,
                            is_final=result.is_final,
                            confidence=result.alternatives[0].confidence,
                            timestamp=asyncio.get_running_loop().time()
                        )
        except exceptions.OutOfRange:
            # Google caps a single streaming session (roughly five minutes
            # of audio) and signals the limit with OutOfRange
            log("⚠️ Google STT: Exceeded maximum audio duration")
            self.is_streaming = False
        except Exception as e:
            log(f"❌ Google STT streaming error: {e}")
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and get final result"""
        self.is_streaming = False
        log("πŸ›‘ Google STT streaming stopped")

        # Process any remaining audio in queue
        if not self.audio_queue.empty():
            # TODO: Process remaining audio
            pass
        return None

    def supports_realtime(self) -> bool:
        """Google Cloud Speech supports real-time streaming"""
        return True

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
        ]

    def _get_encoding(self, encoding: str):
        """Convert encoding string to Google Cloud Speech encoding"""
        encoding_map = {
            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
            "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
            "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
            "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
            "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
        }
        # Unknown strings fall back to WEBM_OPUS, the container/codec that
        # browsers' MediaRecorder typically emits
        return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
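
# --- Usage sketch ----------------------------------------------------------
# A minimal, hedged example of driving this class. It assumes STTConfig
# accepts the fields below as keyword arguments, and that "credentials.json"
# and "sample.pcm" (raw 16 kHz LINEAR16 audio) exist -- all three names are
# hypothetical placeholders, not part of this module's contract.
if __name__ == "__main__":
    async def _demo():
        stt = GoogleCloudSTT("credentials.json")
        config = STTConfig(
            language="tr-TR",
            sample_rate=16000,
            encoding="LINEAR16",
            enable_punctuation=True,
            enable_word_timestamps=False,
            model="default",
            use_enhanced=False,
            interim_results=True,
            single_utterance=True,
        )
        await stt.start_streaming(config)
        with open("sample.pcm", "rb") as f:
            audio = f.read()
        # Feed one buffer and print results until the first final transcript
        async for result in stt.stream_audio(audio):
            log(f"[{'final' if result.is_final else 'interim'}] {result.text}")
            if result.is_final:
                break
        await stt.stop_streaming()

    asyncio.run(_demo())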