import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient


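# Model choices: Whisper-small for local speech recognition, and
# Mistral-7B-Instruct v0.2 served remotely through the HF Inference API.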
ASR_MODEL_NAME = "openai/whisper-small"
NLP_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Mistral-Instruct prompt format: the [INST] block opens here and is closed
# together with the user's message in generate() below.
system_prompt = """<s>[INST] You are Friday, a helpful and conversational assistant."""

client = InferenceClient(NLP_MODEL_NAME)

device = 0 if torch.cuda.is_available() else "cpu"

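# Local ASR pipeline; device=0 selects the first GPU, "cpu" is the fallback.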
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)


def generate(prompt, temperature=0.1, max_new_tokens=64, top_p=0.95, repetition_penalty=1.0):
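    """Send the formatted prompt to the hosted Mistral model and return its reply."""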
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = system_prompt + f""" {prompt} [/INST]"""

    output = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=False, details=False, return_full_text=False)

    print(output)
    return output


@spaces.GPU(duration=60)
def transcribe(audio):
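    """Voice round trip: transcribe the microphone audio, generate a text reply,
    and synthesize it to an MP3 file that Gradio plays back."""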
    sr, y = audio
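    # Gradio's numpy audio format is a (sample_rate, samples) tuple; Whisper
    # expects float32 samples, normalized here to the [-1, 1] range.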
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid dividing by zero on silent input
        y /= peak

    inputs = pipe({"sampling_rate": sr, "raw": y})["text"]

    print("User transcription: ", inputs)

    response = generate(inputs)
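    # Convert the text reply to speech with gTTS and save it where Gradio can serve it.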
    audio_response = gTTS(response)
    audio_response.save("response.mp3")

    print(audio_response)

    return "response.mp3"


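# UI: microphone input on the left, the assistant's spoken reply on the right;
# the reply autoplays once transcription and generation finish.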
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant<h1><center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(fn=transcribe, inputs=audio_input,
                         outputs=output_audio)


demo.queue()
demo.launch()