File size: 1,981 Bytes
8148b06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
import subprocess
import soundfile as sf
from speech_recognition import AudioFile, Recognizer

# Japanese greeting phrases: "welcome" variants, "hello", and "good morning"
# variants from formal to casual. Every entry must be genuine UTF-8 kana;
# mis-decoded (mojibake) text would never equal real Japanese text.
# NOTE(review): presumably compared against speech-recognition output —
# confirm against the callers of this list.
greeting_list = [
    "いらっしゃいませ",
    "いらっしゃい",
    "いらっしゃいませー",
    "こんにちは",
    "おはようございます",
    "おはよう",
    "おはよー",
    "おはー",
]


def ffmpeg_read(
    bpayload: bytes,
    sampling_rate: int,
    output_path: str = 'temp.wav',
) -> str:
    """Decode an audio payload with ffmpeg and save it as a 16-bit PCM WAV file.

    The payload is piped to ffmpeg's stdin. ffmpeg is asked for one channel
    (``-ac 1``), the requested sample rate (``-ar``) and raw ``f32le`` samples
    on stdout. Those samples are then written to ``output_path`` with
    ``soundfile`` using the ``PCM_16`` subtype.

    Args:
        bpayload: Encoded audio bytes to be decoded by ffmpeg.
        sampling_rate: Target sample rate, used both for the ffmpeg
            conversion and when writing the WAV file.
        output_path: Destination of the WAV file. Defaults to ``'temp.wav'``
            (relative to the current working directory). Pass a unique path
            when several conversions may run at once, otherwise they
            overwrite each other's file.

    Returns:
        The path of the written WAV file (``output_path``).

    Raises:
        ValueError: If the ffmpeg executable cannot be found, or if ffmpeg
            produced no audio samples for the payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",  # read the encoded audio from stdin
        "-ac",
        "1",  # single channel
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",  # raw little-endian float32 samples
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",  # write the decoded samples to stdout
    ]

    try:
        ffmpeg_process = subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        )
    except FileNotFoundError as err:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from err

    # The context manager guarantees the pipes are closed and the child is
    # reaped even if communicate() is interrupted.
    with ffmpeg_process:
        out_bytes, _ = ffmpeg_process.communicate(bpayload)

    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with "-loglevel quiet", so a payload it cannot decode fails
    # silently with empty output. Raise instead of writing a header-only WAV.
    if audio.shape[0] == 0:
        raise ValueError(
            "ffmpeg produced no audio data; the payload is empty or is not a "
            "decodable audio file"
        )

    sf.write(output_path, audio, sampling_rate, subtype='PCM_16')
    return output_path


def stt(audio: object, language='ja') -> str:
    """Transcribe recorded speech into text.

    Args:
        audio: source handed to ``AudioFile`` (the record of user speech)
        language (str): language passed on to the recognizer
    Returns:
        text (str): transcription produced by Google's speech-to-text API
    """
    recognizer = Recognizer()
    with AudioFile(audio) as wav_source:
        # Pull the whole recording into memory, then send it for recognition.
        recording = recognizer.record(wav_source)
        return recognizer.recognize_google(recording, language=language)