File size: 3,367 Bytes
d51e19d
 
 
 
 
 
 
 
3721320
d51e19d
1b34aa5
d51e19d
3721320
ed73c38
1b34aa5
6ee0077
 
 
1b34aa5
3721320
1b34aa5
d51e19d
3721320
d51e19d
 
3721320
d51e19d
 
 
 
 
 
86b5bcb
d51e19d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ba294c
d51e19d
 
 
 
6ee0077
d51e19d
 
 
 
ed73c38
86b5bcb
 
ed73c38
6ee0077
3721320
1b34aa5
ed73c38
86b5bcb
1b34aa5
ed73c38
3721320
6ee0077
d51e19d
ed73c38
1b34aa5
d51e19d
 
6ee0077
 
d51e19d
 
6ee0077
 
 
 
 
d51e19d
 
b4920d8
ed73c38
d51e19d
 
c8b7fcf
ed73c38
 
f76ff96
6ee0077
 
 
 
 
 
d51e19d
2b54ce7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient

# Model names
ASR_MODEL_NAME = "openai/whisper-small"  # Whisper checkpoint for local speech-to-text
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # chat LLM served via the HF Inference API

# Initial system prompt in Mistral-instruct format.
# BUG FIX: the original opened with four quote characters (""""), which made
# the string literal start with a stray '"' that was sent to the model as
# part of every prompt.
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant, and you respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""

# Global variables for initial history:
# - instruct history: raw [INST]-tagged transcript fed to the LLM
# - formatted history: human-readable transcript shown in the UI
initial_instruct_history = system_prompt
initial_formatted_history = ""

# Create inference client for text generation (calls the hosted Hugging Face
# Inference API — no local weights are loaded for the LLM).
client = InferenceClient(LLM_MODEL_NAME)

# Set device for ASR pipeline: CUDA device index 0 when a GPU is available,
# otherwise fall back to CPU. (transformers pipelines accept an int GPU
# index or the string "cpu".)
device = 0 if torch.cuda.is_available() else "cpu"

# ASR pipeline — Whisper runs locally on `device`.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)

def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    """Generate one LLM completion for an instruct-formatted history.

    Calls the remote text-generation endpoint with fixed seed sampling and
    returns only the newly generated text (no prompt echo, no streaming).
    """
    # Clamp temperature to a small positive floor — the API rejects 0.
    temperature = max(float(temperature), 1e-2)

    return client.text_generation(
        instruct_history,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=float(top_p),
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=False,
        details=False,
        return_full_text=False,
    )

@spaces.GPU(duration=60)
def transcribe(audio, instruct_history, formatted_history):
    """Run one voice-chat turn: ASR -> LLM -> TTS.

    Args:
        audio: (sample_rate, samples) tuple from the gr.Audio microphone input.
        instruct_history: raw [INST]-tagged prompt history for the LLM.
        formatted_history: human-readable transcript for the UI textbox.

    Returns:
        (audio_path, formatted_history, instruct_history, formatted_history),
        matching the click handler's declared outputs
        [output_audio, transcription_box, instruct_state, formatted_state].
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Peak-normalize to [-1, 1]. BUG FIX: guard against division by zero on
    # silent input, which previously produced NaNs fed into Whisper.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe user audio
    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # Append user input to history
    formatted_history += f"Human: {transcribed_user_audio}\n\n"
    instruct_history += f"<s>[INST] {transcribed_user_audio} [/INST] "

    # Generate LLM response
    llm_response = generate(instruct_history)

    # Append AI response to history
    instruct_history += f" {llm_response}</s>"
    formatted_history += f"Friday: {llm_response}\n\n"

    # Convert AI response to audio
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    # BUG FIX: the click handler declares four outputs, but the original
    # returned only three values — Gradio raises an error on every click.
    # Return formatted_history twice: once for the visible textbox, once for
    # the formatted_state holder.
    return "response.mp3", formatted_history, instruct_history, formatted_history

with gr.Blocks() as demo:
    # Page title.
    gr.HTML("<center><h1>Friday: AI Virtual Assistant</h1><center>")

    # Per-session conversation state:
    # instruct_state holds the raw [INST]-tagged prompt history for the LLM;
    # formatted_state holds the human-readable transcript.
    instruct_state = gr.State(value=initial_instruct_history)
    formatted_state = gr.State(value=initial_formatted_history)

    with gr.Row():
        # Microphone input from the user, and the assistant's reply audio
        # (filepath-backed, autoplayed, not user-editable).
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    # Textbox to display the full conversation history
    transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")

    # Pass states to the transcribe function and update them after each click.
    # NOTE(review): the outputs list must match the number and order of values
    # returned by `transcribe` — verify they agree.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, instruct_state, formatted_state],
        outputs=[output_audio, transcription_box, instruct_state, formatted_state]
    )

if __name__ == "__main__":
    # Enable request queuing before serving, then launch the app.
    demo.queue()
    demo.launch()