# utils.py - created by vumichien in commit 8148b06 ("Create utils.py"), 1.98 kB.
import numpy as np
import subprocess
import soundfile as sf
from speech_recognition import AudioFile, Recognizer
# Japanese greeting phrases: shop welcomes ("irasshaimase" variants),
# "konnichiwa", and formal/casual "ohayou" (good morning) variants.
# The literals must be stored as real UTF-8 kana; a mis-decoded copy of these
# strings can never compare equal to genuine Japanese text.
greeting_list = [
    "いらっしゃいませ",
    "いらっしゃい",
    "いらっしゃいませー",
    "こんにちは",
    "おはようございます",
    "おはよう",
    "おはよー",
    "おはー",
]
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> str:
    """
    Decode an audio payload through ffmpeg and save it as a 16-bit PCM WAV file.

    The payload is piped to an ``ffmpeg`` subprocess, which converts it to
    mono 32-bit float samples at ``sampling_rate``. Those samples are then
    written to ``temp.wav`` (relative to the current working directory).

    Args:
        bpayload: Bytes of the encoded audio, fed to ffmpeg on stdin.
        sampling_rate: Target sampling rate, used both for the ffmpeg
            conversion and for the WAV file that is written.

    Returns:
        The path of the written file, which is always ``'temp.wav'``.

    Raises:
        ValueError: If the ``ffmpeg`` executable cannot be found, or if ffmpeg
            produced no audio data for the payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",  # read the encoded audio from stdin
        "-ac",
        "1",  # downmix to a single channel
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",  # raw little-endian float32 samples, no container
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",  # write the decoded samples to stdout
    ]

    try:
        ffmpeg_process = subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
    except FileNotFoundError as err:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from err

    out_bytes, _ = ffmpeg_process.communicate(bpayload)

    # ffmpeg runs with "-loglevel quiet", so a failed decode is only visible
    # as empty output; fail loudly instead of writing an empty WAV file.
    if not out_bytes:
        raise ValueError("Malformed soundfile")

    audio = np.frombuffer(out_bytes, np.float32)

    # NOTE: the output path is fixed, so every call overwrites the same file.
    sf.write('temp.wav', audio, sampling_rate, subtype='PCM_16')
    return 'temp.wav'
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded speech into text.

    Args:
        audio: the user's recording; passed unchanged to ``AudioFile``
        language (str): language code handed to the recognizer

    Returns:
        text (str): the transcription returned by ``recognize_google``
    """
    recognizer = Recognizer()
    with AudioFile(audio) as audio_source:
        # Pull the recording into memory so it can be sent for recognition.
        recording = recognizer.record(audio_source)
        # Ask Google's speech-to-text API for the transcription.
        return recognizer.recognize_google(recording, language=language)