Spaces:
Building
Building
"""
Deepgram Speech-to-Text Implementation - Simple Batch Mode
"""
import io
import json
import traceback
import wave
from datetime import datetime
from typing import List, Optional

import aiohttp

from utils.logger import log_debug, log_error, log_info, log_warning
from .stt_interface import STTConfig, STTInterface, TranscriptionResult
class DeepgramSTT(STTInterface):
    """Speech-to-text provider that sends audio to Deepgram in batch mode.

    Each call to ``transcribe`` wraps the supplied audio in a WAV container
    and uploads it with a single HTTP POST to ``base_url``.
    """

    # Language codes accepted from callers -> codes sent to Deepgram.
    # Codes that are not listed here are forwarded unchanged.
    _LANGUAGE_MAP = {
        "tr": "tr",
        "tr-TR": "tr",
        "en": "en-US",
        "en-US": "en-US",
        "en-GB": "en-GB",
        "de": "de",
        "de-DE": "de",
        "fr": "fr",
        "fr-FR": "fr",
        "es": "es",
        "es-ES": "es",
        "it": "it",
        "it-IT": "it",
        "pt": "pt-BR",
        "pt-BR": "pt-BR",
        "ru": "ru",
        "ru-RU": "ru",
        "ja": "ja",
        "ja-JP": "ja",
        "ko": "ko",
        "ko-KR": "ko",
        "zh": "zh-CN",
        "zh-CN": "zh-CN",
        "ar": "ar",
        "ar-SA": "ar",
    }

    # Language codes reported by get_supported_languages().
    # Source: https://developers.deepgram.com/docs/models-languages
    _SUPPORTED_LANGUAGES = (
        "en", "es", "fr", "de", "it", "pt", "ru", "tr", "pl", "nl",
        "sv", "no", "fi", "da", "ja", "ko", "zh", "hi", "id", "th",
        "uk", "cs", "el", "he", "ar", "fa", "ta", "tl",
    )

    def __init__(self, api_key: str):
        """
        Initialize Deepgram STT

        Args:
            api_key: Deepgram API key
        """
        self.api_key = api_key
        self.base_url = "https://api.deepgram.com/v1/listen"
        log_info("✅ Deepgram STT initialized in batch mode")

    def _map_language_code(self, language: str) -> str:
        """Map a language code to the code sent to Deepgram.

        Args:
            language: Language code as supplied by the caller (e.g. "en", "tr-TR").

        Returns:
            The mapped code, or ``language`` itself when it has no entry in
            the mapping table.
        """
        return self._LANGUAGE_MAP.get(language, language)

    def _build_query_params(self, config: STTConfig, language: str) -> dict:
        """Build the query parameters for a transcription request.

        Args:
            config: Request settings; reads ``enable_punctuation``, ``model``,
                ``use_enhanced`` and ``enable_word_timestamps``.
            language: Language code already mapped by ``_map_language_code``.

        Returns:
            A dict of string query parameters.
        """
        # "latest_long" is replaced by "general"; an unset/empty model falls
        # back to the same default instead of being sent as the text "None".
        model = config.model
        if not model or model == "latest_long":
            model = "general"

        params = {
            "language": language,
            "punctuate": str(config.enable_punctuation).lower(),
            "model": model,
            "tier": "enhanced" if config.use_enhanced else "base",
        }

        # Add word timestamps if requested
        if config.enable_word_timestamps:
            params["utterances"] = "true"
            params["words"] = "true"

        return params

    def _extract_transcription(
        self, result: dict, config: STTConfig, language: str
    ) -> Optional[TranscriptionResult]:
        """Turn a decoded Deepgram JSON response into a TranscriptionResult.

        Only the first alternative of the first channel is used.

        Args:
            result: Decoded JSON body of a 200 response.
            config: Request settings; reads ``enable_word_timestamps``.
            language: Language code that was sent with the request.

        Returns:
            The transcription, or None when the response has no channels or
            no alternatives.
        """
        channels = (result.get("results") or {}).get("channels")
        if not channels:
            return None

        alternatives = channels[0].get("alternatives")
        if not alternatives:
            return None

        best = alternatives[0]

        # Extract word timestamps if available
        word_timestamps = None
        if config.enable_word_timestamps and best.get("words"):
            word_timestamps = [
                {
                    "word": word["word"],
                    "start_time": word["start"],
                    "end_time": word["end"],
                }
                for word in best["words"]
            ]

        return TranscriptionResult(
            text=best.get("transcript", ""),
            confidence=best.get("confidence", 0.0),
            timestamp=datetime.now().timestamp(),
            language=language,
            word_timestamps=word_timestamps,
        )

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe audio data using Deepgram API.

        Args:
            audio_data: Audio bytes; they are wrapped in a WAV container by
                ``_convert_to_wav`` before upload.
            config: Request settings; reads ``sample_rate``, ``language``,
                ``enable_punctuation``, ``model``, ``use_enhanced`` and
                ``enable_word_timestamps``.

        Returns:
            The transcription, or None when ``audio_data`` is empty, the API
            answers with a non-200 status, the response holds no
            transcription, or any exception occurs (exceptions are logged,
            not propagated).
        """
        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            language = self._map_language_code(config.language)
            params = self._build_query_params(config, language)
            headers = {
                "Authorization": f"Token {self.api_key}",
                "Content-Type": "audio/wav",
            }

            log_info("🔄 Sending audio to Deepgram API...")
            async with aiohttp.ClientSession() as session:
                # Passing params= lets aiohttp URL-encode the query string
                # instead of concatenating raw values into the URL.
                async with session.post(
                    self.base_url, params=params, headers=headers, data=wav_audio
                ) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        log_error(f"❌ Deepgram API error: {response.status} - {error_text}")
                        return None
                    result = await response.json()

            transcription = self._extract_transcription(result, config, language)
            if transcription is None:
                log_warning("⚠️ No transcription in response")
                return None

            log_info(f"✅ Transcription: '{transcription.text}' (confidence: {transcription.confidence:.2f})")
            return transcription

        except Exception as e:
            # Boundary handler: callers receive None instead of an exception.
            log_error(f"❌ Error during transcription: {str(e)}")
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format.

        The bytes are written as-is under a mono, 16-bit header, so the input
        is assumed to already be 16-bit mono PCM at ``sample_rate``.

        Args:
            audio_data: Raw PCM sample bytes.
            sample_rate: Frame rate written to the WAV header, in Hz.

        Returns:
            The complete WAV file contents (header plus samples).
        """
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        # The header sizes are finalized when the writer closes, so the
        # buffer is read only after the with-block.
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes.

        Returns:
            A new list on every call, so callers may modify it freely.
        """
        return list(self._SUPPORTED_LANGUAGES)

    def get_provider_name(self) -> str:
        """Get provider name"""
        return "deepgram"