Spaces:
Building
Building
""" | |
Google Cloud Speech-to-Text Implementation | |
""" | |
import os | |
import asyncio | |
from typing import AsyncIterator, Optional, List | |
from google.cloud import speech_v1p1beta1 as speech | |
from google.api_core import exceptions | |
from utils import log | |
from stt_interface import STTInterface, STTConfig, TranscriptionResult | |
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation (async streaming).

    Wraps the v1p1beta1 ``SpeechAsyncClient``. Audio chunks are fed through an
    internal ``asyncio.Queue`` and forwarded to Google as
    ``StreamingRecognizeRequest`` messages.
    """

    def __init__(self, credentials_path: str):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a service-account JSON file. When empty
                or nonexistent, Application Default Credentials are used.
        """
        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log(f"✅ Google credentials set from: {credentials_path}")
        else:
            log("⚠️ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        self.streaming_config: Optional[speech.StreamingRecognitionConfig] = None
        self.is_streaming: bool = False
        # Chunks awaiting transmission; a None sentinel unblocks the request
        # generator when streaming stops.
        self.audio_queue: asyncio.Queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Build the recognition/streaming config and mark the session active.

        Raises:
            Exception: re-raised after logging if config construction fails.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                    audio_topic="general"
                )
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance
            )
            self.is_streaming = True
            log("✅ Google STT streaming session started")
        except Exception as e:
            log(f"❌ Failed to start Google STT streaming: {e}")
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Feed one audio chunk and yield transcription results.

        Fixes vs. previous version:
        - The streaming API takes a single ``requests`` iterator whose FIRST
          ``StreamingRecognizeRequest`` must carry ``streaming_config``; the
          config cannot be passed as a separate positional argument.
        - The request generator now honors a ``None`` sentinel so it no longer
          blocks forever on the queue after ``stop_streaming()``.

        NOTE(review): each call still opens a fresh gRPC stream; a single
        long-lived background stream would be more efficient — confirm
        intended usage with callers.
        """
        if not self.is_streaming:
            log("⚠️ STT streaming not started")
            return

        try:
            await self.audio_queue.put(audio_chunk)

            async def request_generator():
                # Per the API contract, the first message carries the config
                # and must not contain audio.
                yield speech.StreamingRecognizeRequest(
                    streaming_config=self.streaming_config
                )
                while self.is_streaming:
                    chunk = await self.audio_queue.get()
                    if chunk is None:  # sentinel pushed by stop_streaming()
                        return
                    yield speech.StreamingRecognizeRequest(audio_content=chunk)

            responses = await self.client.streaming_recognize(
                requests=request_generator()
            )

            async for response in responses:
                for result in response.results:
                    if result.alternatives:
                        top = result.alternatives[0]
                        yield TranscriptionResult(
                            text=top.transcript,
                            is_final=result.is_final,
                            confidence=top.confidence,
                            # get_running_loop() is the non-deprecated way to
                            # read the loop clock from inside a coroutine.
                            timestamp=asyncio.get_running_loop().time()
                        )
        except exceptions.OutOfRange:
            # Google caps the duration of one streaming session.
            log("⚠️ Google STT: Exceeded maximum audio duration")
            self.is_streaming = False
        except Exception as e:
            log(f"❌ Google STT streaming error: {e}")
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and unblock any pending request generator.

        Returns:
            None — no buffered final result is produced.
        """
        self.is_streaming = False
        # Sentinel wakes the generator that may be awaiting queue.get().
        await self.audio_queue.put(None)
        log("🛑 Google STT streaming stopped")
        return None

    def supports_realtime(self) -> bool:
        """Google Cloud Speech supports real-time streaming."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the BCP-47 language codes this integration exposes."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
        ]

    def _get_encoding(self, encoding: str):
        """Map an encoding name to the Google Cloud Speech enum.

        Unknown names fall back to WEBM_OPUS (the project's browser default).
        """
        encoding_map = {
            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
            "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
            "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
            "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
            "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
        }
        return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)