"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
import aiohttp
import json
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class DeepgramSTT(STTInterface):
    def __init__(self, api_key: str):
        """
        Initialize Deepgram STT

        Args:
            api_key: Deepgram API key
        """
        try:
            self.api_key = api_key
            self.base_url = "https://api.deepgram.com/v1/listen"
            log_info("✅ Deepgram STT initialized in batch mode")
        except Exception as e:
            log_error(f"❌ Failed to initialize Deepgram STT: {str(e)}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Map language codes to Deepgram format"""
        # Deepgram uses different language codes
        language_map = {
            "tr": "tr",
            "tr-TR": "tr",
            "en": "en-US",
            "en-US": "en-US",
            "en-GB": "en-GB",
            "de": "de",
            "de-DE": "de",
            "fr": "fr",
            "fr-FR": "fr",
            "es": "es",
            "es-ES": "es",
            "it": "it",
            "it-IT": "it",
            "pt": "pt-BR",
            "pt-BR": "pt-BR",
            "ru": "ru",
            "ru-RU": "ru",
            "ja": "ja",
            "ja-JP": "ja",
            "ko": "ko",
            "ko-KR": "ko",
            "zh": "zh-CN",
            "zh-CN": "zh-CN",
            "ar": "ar",
            "ar-SA": "ar",
        }
        # Default to the language itself if not in map
        return language_map.get(language, language)
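
    # e.g. _map_language_code("tr-TR") -> "tr", _map_language_code("en") -> "en-US";
    # codes not in the map (such as "nl") are passed through unchanged.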

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe audio data using Deepgram API"""
        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # Build Deepgram API parameters
            language = self._map_language_code(config.language)
            params = {
                "language": language,
                "punctuate": str(config.enable_punctuation).lower(),
                "model": config.model if config.model != "latest_long" else "general",
                "tier": "enhanced" if config.use_enhanced else "base",
            }

            # Add word timestamps if requested
            if config.enable_word_timestamps:
                params["utterances"] = "true"
                params["words"] = "true"

            # Build URL with parameters
            url = f"{self.base_url}?" + "&".join([f"{k}={v}" for k, v in params.items()])

            # Prepare headers
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav"
            }

            # Make API request
            log_info("🔄 Sending audio to Deepgram API...")

            async with aiohttp.ClientSession() as session:
                async with session.post(url, headers=headers, data=wav_audio) as response:
                    if response.status == 200:
                        result = await response.json()

                        # Extract transcription from response
                        if result.get("results") and result["results"].get("channels"):
                            channel = result["results"]["channels"][0]
                            if channel.get("alternatives"):
                                alternative = channel["alternatives"][0]

                                # Extract word timestamps if available
                                word_timestamps = None
                                if config.enable_word_timestamps and alternative.get("words"):
                                    word_timestamps = [
                                        {
                                            "word": word["word"],
                                            "start_time": word["start"],
                                            "end_time": word["end"]
                                        }
                                        for word in alternative["words"]
                                    ]

                                transcription = TranscriptionResult(
                                    text=alternative.get("transcript", ""),
                                    confidence=alternative.get("confidence", 0.0),
                                    timestamp=datetime.now().timestamp(),
                                    language=language,
                                    word_timestamps=word_timestamps
                                )

                                log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
                                return transcription

                        log_warning("⚠️ No transcription in response")
                        return None
                    else:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None

        except Exception as e:
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
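
    # For reference, the JSON shape this parser expects (inferred from the
    # field accesses above, trimmed to the keys actually read; values are
    # illustrative):
    #
    #   {
    #     "results": {
    #       "channels": [
    #         {
    #           "alternatives": [
    #             {
    #               "transcript": "merhaba dünya",
    #               "confidence": 0.98,
    #               "words": [{"word": "merhaba", "start": 0.08, "end": 0.42}]
    #             }
    #           ]
    #         }
    #       ]
    #     }
    #   }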

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()

        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)    # Mono
            wav_file.setsampwidth(2)    # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)

        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()
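
    # Example: one second of 16 kHz, 16-bit mono PCM (32000 bytes of samples)
    # comes back as the same samples prefixed with a standard 44-byte WAV header.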

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Language codes supported by Deepgram
        # Source: https://developers.deepgram.com/docs/models-languages
        return [
            "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl",
            "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th",
            "uk", "cs", "el", "he", "ar", "fa", "ta", "tl"
        ]

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "deepgram"