import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
# Model names
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Initial system prompt
system_prompt = """"<s>[INST] You are Friday, a helpful and conversational AI assistant, and you respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""
# Global variables for initial history
initial_instruct_history = system_prompt
initial_formatted_history = ""
# Create inference client for text generation
client = InferenceClient(LLM_MODEL_NAME)
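# Note: InferenceClient sends requests to the hosted Hugging Face Inference
# API, so the 7B model is never loaded into this Space's own memory.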
# Set device for ASR pipeline
device = 0 if torch.cuda.is_available() else "cpu"
# ASR pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
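# The ASR pipeline accepts in-memory audio as {"sampling_rate": int, "raw": np.ndarray},
# which is how transcribe() below feeds it microphone input.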
def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    # The endpoint rejects temperature <= 0, so clamp to a small positive value
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    output = client.text_generation(
        instruct_history, **generate_kwargs, stream=False, details=False, return_full_text=False
    )
    return output
@spaces.GPU(duration=60)
def transcribe(audio, instruct_history, formatted_history):
    sr, y = audio

    # Normalize raw PCM samples to [-1, 1]; guard against all-silent input
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe user audio
    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # Append user input to both histories
    formatted_history += f"Human: {transcribed_user_audio}\n\n"
    instruct_history += f"<s>[INST] {transcribed_user_audio} [/INST] "

    # Generate LLM response
    llm_response = generate(instruct_history)

    # Append AI response to both histories
    instruct_history += f" {llm_response}</s>"
    formatted_history += f"Friday: {llm_response}\n\n"

    # Convert AI response to audio
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    # Return the audio path plus the updated histories; the order must match
    # the four outputs wired to the button click below
    return "response.mp3", formatted_history, instruct_history, formatted_history
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1></center>")

    # Per-session conversation state
    instruct_state = gr.State(value=initial_instruct_history)
    formatted_state = gr.State(value=initial_formatted_history)

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
        output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    # Textbox to display the full conversation history
    transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")

    # Pass states to the transcribe function and update them after each click
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, instruct_state, formatted_state],
        outputs=[output_audio, transcription_box, instruct_state, formatted_state],
    )
if __name__ == "__main__":
    demo.queue()
    demo.launch()