import subprocess

import numpy as np
import soundfile as sf
from speech_recognition import AudioFile, Recognizer

# Greeting phrases (Japanese); not referenced by the functions in this module.
greeting_list = [
    "いらっしゃいませ",
    "いらっしゃい",
    "いらっしゃいませー",
    "こんにちは",
    "おはようございます",
    "おはよう",
    "おはよー",
    "おはー",
]


def ffmpeg_read(bpayload: bytes, sampling_rate: int,
                output_path: str = 'temp.wav') -> str:
    """Decode an audio payload through ffmpeg and save it as a 16-bit WAV.

    The payload is piped to ffmpeg on stdin, converted to mono raw 32-bit
    float samples at ``sampling_rate``, read back from stdout and written
    to ``output_path`` with ``soundfile``.

    Args:
        bpayload: Raw bytes of the encoded audio to decode.
        sampling_rate: Target sampling rate, passed to ffmpeg via ``-ar``
            and used as the sample rate of the written WAV file.
        output_path: Where the WAV file is written. Defaults to
            ``'temp.wav'`` in the current working directory; pass a unique
            path when several conversions may run at the same time, since
            the default is overwritten on every call.

    Returns:
        The path of the written WAV file (``output_path``).

    Raises:
        ValueError: If the ffmpeg executable cannot be found, or if ffmpeg
            produced no audio samples for the given payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",           # read the encoded audio from stdin
        "-ac", "1",               # downmix to a single channel
        "-ar", f"{sampling_rate}",
        "-f", "f32le",            # raw little-endian float32 samples
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                 # write the decoded samples to stdout
    ]

    try:
        process = subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
    except FileNotFoundError as err:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from err

    # The context manager guarantees the pipes are closed and the child is
    # reaped even if communicate() raises.
    with process:
        out_bytes, _ = process.communicate(bpayload)

    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        # ffmpeg runs with "-loglevel quiet", so an undecodable payload only
        # shows up as empty output; fail here instead of writing an empty WAV.
        raise ValueError("Malformed soundfile")

    sf.write(output_path, audio, sampling_rate, subtype='PCM_16')
    return output_path


def stt(audio: object, language='ja') -> str:
    """Convert recorded speech to text.

    Args:
        audio: Audio source accepted by ``speech_recognition.AudioFile``
            (for example the WAV path returned by ``ffmpeg_read``).
        language (str): Language code passed to the recognizer.

    Returns:
        text (str): The recognized speech.
    """
    recognizer = Recognizer()

    # Load the complete audio file into memory.
    with AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    # Transcribe the audio using Google's speech-to-text API.
    return recognizer.recognize_google(audio_data, language=language)