friday / app.py
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
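
# Whisper-small handles speech-to-text inside the Space; Mistral-7B-Instruct is
# queried remotely through the Hugging Face Inference API.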
ASR_MODEL_NAME = "openai/whisper-small"
NLP_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Mistral-Instruct prompts use the <s> [INST] ... [/INST] chat format.
system_prompt = """<s> [INST] You are Friday, a helpful and conversational assistant. [/INST]"""
client = InferenceClient(NLP_MODEL_NAME)
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
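
# Build the instruct-formatted prompt and query the hosted Mistral model for a text reply.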
def generate(prompt, temperature=0.1, max_new_tokens=64, top_p=0.95, repetition_penalty=1.0):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = system_prompt + f""" {prompt} </s>"""

    output = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=False, details=False, return_full_text=False)
    print(output)

    return output
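
# End-to-end voice turn: normalize the mic audio, transcribe it with Whisper,
# generate a reply with the LLM, and synthesize speech with gTTS. The
# @spaces.GPU decorator reserves ZeroGPU hardware for up to 60 seconds per call.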
@spaces.GPU(duration=60)
def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)

    # Normalize to [-1, 1]; guard against all-silent input to avoid dividing by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    inputs = pipe({"sampling_rate": sr, "raw": y})["text"]
    print("User transcription: ", inputs)

    response = generate(inputs)

    audio_response = gTTS(response)
    audio_response.save("response.mp3")
    print(audio_response)

    return "response.mp3"
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath",
                                interactive=False,
                                autoplay=True,
                                elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")
    transcribe_btn.click(fn=transcribe, inputs=audio_input,
                         outputs=output_audio)
demo.queue()
demo.launch()