import io
import os
import types
import wave

import speech_recognition as sr
import whisper
from pydub import AudioSegment

from realtime_ai_character.audio.speech_to_text.base import SpeechToText
from realtime_ai_character.logger import get_logger
from realtime_ai_character.utils import Singleton

DEBUG = False
logger = get_logger(__name__)
config = types.SimpleNamespace(**{
    'model': 'tiny',
    'language': 'en',
    'api_key': os.getenv("OPENAI_API_KEY"),
})


class Whisper(Singleton, SpeechToText):
    def __init__(self, use='local'):
        super().__init__()
        if use == 'local':
            # Pre-download the model; recognize_whisper loads (and caches)
            # it from the local cache again at transcription time.
            logger.info(f"Loading [Local Whisper] model: [{config.model}]...")
            whisper.load_model(config.model)
        self.recognizer = sr.Recognizer()
        self.use = use
        if DEBUG:
            self.wf = wave.open('output.wav', 'wb')
            self.wf.setnchannels(1)  # Assuming mono audio
            self.wf.setsampwidth(2)  # Assuming 16-bit audio
            self.wf.setframerate(44100)  # Assuming 44100Hz sample rate

    def transcribe(self, audio_bytes, platform, prompt=''):
        logger.info("Transcribing audio...")
        if platform == 'web':
            # Browsers record WebM; convert it to WAV first.
            audio = self._convert_webm_to_wav(audio_bytes)
        else:
            # Raw PCM: 44100 Hz sample rate, 2-byte (16-bit) samples.
            audio = sr.AudioData(audio_bytes, 44100, 2)
        if self.use == 'local':
            return self._transcribe(audio, prompt)
        elif self.use == 'api':
            return self._transcribe_api(audio, prompt)

    def _transcribe(self, audio, prompt=''):
        text = self.recognizer.recognize_whisper(
            audio,
            model=config.model,
            language=config.language,
            show_dict=True,
            initial_prompt=prompt,
        )['text']
        return text

    def _transcribe_api(self, audio, prompt=''):
        # The OpenAI API helper in speech_recognition does not accept an
        # initial prompt, so `prompt` is unused on this path.
        text = self.recognizer.recognize_whisper_api(
            audio,
            api_key=config.api_key,
        )
        return text

    def _convert_webm_to_wav(self, webm_data):
        webm_audio = AudioSegment.from_file(io.BytesIO(webm_data), format="webm")
        wav_data = io.BytesIO()
        webm_audio.export(wav_data, format="wav")
        with sr.AudioFile(wav_data) as source:
            audio = self.recognizer.record(source)
        return audio
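
# Usage sketch (illustrative, not part of the original module): assumes a
# 16-bit, 44.1 kHz mono WAV file named 'sample.wav' (hypothetical) sits next
# to this script; any non-'web' platform value routes raw PCM bytes through
# sr.AudioData, while 'web' expects WebM-encoded bytes from a browser recorder.
if __name__ == '__main__':
    stt = Whisper(use='local')
    with wave.open('sample.wav', 'rb') as f:
        pcm_bytes = f.readframes(f.getnframes())
    print(stt.transcribe(pcm_bytes, platform='terminal'))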