"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
import aiohttp
import json
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class DeepgramSTT(STTInterface):
    def __init__(self, api_key: str):
        """
        Initialize Deepgram STT

        Args:
            api_key: Deepgram API key
        """
        try:
            self.api_key = api_key
            self.base_url = "https://api.deepgram.com/v1/listen"
            log_info("✅ Deepgram STT initialized in batch mode")
        except Exception as e:
            log_error(f"❌ Failed to initialize Deepgram STT: {str(e)}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Map language codes to Deepgram format"""
        # Deepgram uses different language codes
        language_map = {
            "tr": "tr",
            "tr-TR": "tr",
            "en": "en-US",
            "en-US": "en-US",
            "en-GB": "en-GB",
            "de": "de",
            "de-DE": "de",
            "fr": "fr",
            "fr-FR": "fr",
            "es": "es",
            "es-ES": "es",
            "it": "it",
            "it-IT": "it",
            "pt": "pt-BR",
            "pt-BR": "pt-BR",
            "ru": "ru",
            "ru-RU": "ru",
            "ja": "ja",
            "ja-JP": "ja",
            "ko": "ko",
            "ko-KR": "ko",
            "zh": "zh-CN",
            "zh-CN": "zh-CN",
            "ar": "ar",
            "ar-SA": "ar",
        }
        # Default to the language itself if not in map
        return language_map.get(language, language)
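
    # e.g. _map_language_code("tr-TR") -> "tr", _map_language_code("en") -> "en-US";
    # codes not in the map (such as "nl") are passed through unchanged.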

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe audio data using Deepgram API"""
        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # Build Deepgram API parameters
            language = self._map_language_code(config.language)
            params = {
                "language": language,
                "punctuate": str(config.enable_punctuation).lower(),
                "model": config.model if config.model != "latest_long" else "general",
                "tier": "enhanced" if config.use_enhanced else "base",
            }

            # Add word timestamps if requested
            if config.enable_word_timestamps:
                params["utterances"] = "true"
                params["words"] = "true"

            # Build URL with parameters
            url = f"{self.base_url}?" + "&".join([f"{k}={v}" for k, v in params.items()])

            # Prepare headers
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav"
            }

            # Make API request
            log_info("🔄 Sending audio to Deepgram API...")

            async with aiohttp.ClientSession() as session:
                async with session.post(url, headers=headers, data=wav_audio) as response:
                    if response.status == 200:
                        result = await response.json()

                        # Extract transcription from response
                        if result.get("results") and result["results"].get("channels"):
                            channel = result["results"]["channels"][0]
                            if channel.get("alternatives"):
                                alternative = channel["alternatives"][0]

                                # Extract word timestamps if available
                                word_timestamps = None
                                if config.enable_word_timestamps and alternative.get("words"):
                                    word_timestamps = [
                                        {
                                            "word": word["word"],
                                            "start_time": word["start"],
                                            "end_time": word["end"]
                                        }
                                        for word in alternative["words"]
                                    ]

                                transcription = TranscriptionResult(
                                    text=alternative.get("transcript", ""),
                                    confidence=alternative.get("confidence", 0.0),
                                    timestamp=datetime.now().timestamp(),
                                    language=language,
                                    word_timestamps=word_timestamps
                                )

                                log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
                                return transcription

                        log_warning("⚠️ No transcription in response")
                        return None
                    else:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None

        except Exception as e:
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
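
    # For reference, the JSON shape this parser expects (inferred from the
    # field accesses above, trimmed to the keys actually read; values are
    # illustrative):
    #
    #   {
    #     "results": {
    #       "channels": [
    #         {
    #           "alternatives": [
    #             {
    #               "transcript": "merhaba dünya",
    #               "confidence": 0.98,
    #               "words": [{"word": "merhaba", "start": 0.08, "end": 0.42}]
    #             }
    #           ]
    #         }
    #       ]
    #     }
    #   }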

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()

        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)    # Mono
            wav_file.setsampwidth(2)    # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)

        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()
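
    # Example: one second of 16 kHz, 16-bit mono PCM (32000 bytes of samples)
    # comes back as the same samples prefixed with a standard 44-byte WAV header.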

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Language codes supported by Deepgram
        # Source: https://developers.deepgram.com/docs/models-languages
        return [
            "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl",
            "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th",
            "uk", "cs", "el", "he", "ar", "fa", "ta", "tl"
        ]

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "deepgram"