|
from google.cloud import speech |
|
import types |
|
|
|
from realtime_ai_character.audio.speech_to_text.base import SpeechToText |
|
from realtime_ai_character.logger import get_logger |
|
from realtime_ai_character.utils import Singleton |
|
|
|
logger = get_logger(__name__)

# Recognition settings per client platform. transcribe() looks an entry up by
# platform name through the namespace's __dict__ and unpacks it into
# speech.RecognitionConfig, so every key must be a RecognitionConfig field.
config = types.SimpleNamespace(
    # WebM/Opus at 48 kHz.
    # NOTE(review): presumably what the browser client records - confirm.
    web={
        'encoding': speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
        'sample_rate_hertz': 48000,
        'language_code': 'en-US',
        'max_alternatives': 1,
    },
    # 16-bit linear PCM at 44.1 kHz.
    # NOTE(review): presumably what the terminal client records - confirm.
    terminal={
        'encoding': speech.RecognitionConfig.AudioEncoding.LINEAR16,
        'sample_rate_hertz': 44100,
        'language_code': 'en-US',
        'max_alternatives': 1,
    },
)
|
|
|
|
|
class Google(Singleton, SpeechToText):
    """Speech-to-text backend that sends audio to Google Cloud Speech.

    NOTE(review): inherits from the project's ``Singleton`` helper, so
    presumably a single instance (and a single API client) is shared across
    the process - confirm in ``realtime_ai_character.utils``.
    """

    def __init__(self):
        super().__init__()
        logger.info("Setting up [Google Speech to Text]...")
        # Created once here and reused by every transcribe() call.
        self.client = speech.SpeechClient()

    @staticmethod
    def _phrase_hints(prompt):
        """Split a comma-separated prompt into clean, non-empty phrases.

        Surrounding whitespace is stripped and blank entries are dropped, so
        an empty (or ``None``) prompt yields an empty list rather than a list
        containing one empty phrase.
        """
        if not prompt:
            return []
        stripped = (phrase.strip() for phrase in prompt.split(','))
        return [phrase for phrase in stripped if phrase]

    def transcribe(self, audio_bytes, platform, prompt='') -> str:
        """Transcribe an audio clip and return the recognized text.

        Args:
            audio_bytes: Raw audio content, encoded as described by the
                ``config`` entry for ``platform``.
            platform: Name of an entry in the module-level ``config``
                namespace (``'web'`` or ``'terminal'``).
            prompt: Optional comma-separated phrases passed to the recognizer
                as speech-context hints.

        Returns:
            The top transcript of every returned result, concatenated in
            order, or ``''`` when nothing was recognized.

        Raises:
            KeyError: If ``platform`` has no entry in ``config``.
        """
        settings = {}
        phrases = self._phrase_hints(prompt)
        if phrases:
            # Only attach a speech context when there is a real phrase to hint.
            settings['speech_contexts'] = [speech.SpeechContext(phrases=phrases)]
        # Platform settings are applied last so they win on any key clash.
        settings.update(vars(config)[platform])

        response = self.client.recognize(
            config=speech.RecognitionConfig(settings),
            audio=speech.RecognitionAudio(content=audio_bytes),
        )

        # The API may split one clip into several sequential results; keep
        # them all instead of returning only the first segment. Results with
        # no alternatives are skipped.
        return ''.join(
            result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        )
|
|