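# Friday: AI Virtual Assistant
#
# Pipeline: microphone audio -> Whisper ASR -> Mistral-7B-Instruct (via the
# Hugging Face Inference API) -> gTTS speech synthesis -> spoken reply.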
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
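# Whisper runs locally through a transformers pipeline; Mistral is called
# remotely through huggingface_hub's InferenceClient, so no generation
# weights are downloaded to the Space.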
ASR_MODEL_NAME = "openai/whisper-small"
NLP_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

system_prompt = "You are Friday, a helpful and conversational assistant."
client = InferenceClient(NLP_MODEL_NAME)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
def generate(prompt, temperature=0.1, max_new_tokens=64, top_p=0.95, repetition_penalty=1.0):
    # Sampling requires a strictly positive temperature; clamp to a small floor.
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Wrap the system prompt and the user's message together in Mistral's
    # <s>[INST] ... [/INST] instruction template.
    formatted_prompt = f"<s>[INST] {system_prompt} {prompt} [/INST]"
    output = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=False, details=False, return_full_text=False
    )
    print("Assistant response: ", output)
    return output
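# On ZeroGPU Spaces, spaces.GPU allocates a GPU only while the decorated
# function runs; duration caps the reservation at 60 seconds.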
@spaces.GPU(duration=60)
def transcribe(audio):
    sr, y = audio

    # Convert the raw samples to float32 and peak-normalize; guard against
    # all-silence input to avoid dividing by zero.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    inputs = pipe({"sampling_rate": sr, "raw": y})["text"]
    print("User transcription: ", inputs)

    response = generate(inputs)

    # Synthesize the reply with gTTS and hand Gradio the saved file path.
    audio_response = gTTS(response)
    audio_response.save("response.mp3")
    return "response.mp3"
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")
    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(
            label="Friday",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(fn=transcribe, inputs=audio_input, outputs=output_audio)

demo.queue()
demo.launch()
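# If run locally rather than on a Space, Gradio serves the demo at
# http://127.0.0.1:7860 by default.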