Spaces:

gospacedev
/

friday

Sleeping

File size: 3,150 Bytes

d51e19d
 
 
 
 
 
 
 
3721320
d51e19d
1b34aa5
d51e19d
3721320
ed73c38
1b34aa5
61b53d6
 
 
1b34aa5
3721320
1b34aa5
d51e19d
3721320
d51e19d
 
3721320
d51e19d
 
 
 
 
 
86b5bcb
d51e19d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ba294c
d51e19d
 
 
 
61b53d6
 
 
d51e19d
 
 
 
ed73c38
86b5bcb
 
ed73c38
61b53d6
 
3721320
1b34aa5
ed73c38
86b5bcb
1b34aa5
ed73c38
3721320
61b53d6
d51e19d
ed73c38
1b34aa5
d51e19d
 
61b53d6
d51e19d
61b53d6
 
6ee0077
61b53d6
 
d51e19d
 
b4920d8
ed73c38
d51e19d
 
c8b7fcf
ed73c38
 
f76ff96
61b53d6
d51e19d
2b54ce7
 
61b53d6

import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient

# Hugging Face Hub identifiers of the speech-recognition and chat models
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Seed conversation: one instruction/answer pair that establishes the
# assistant's persona. The text must start directly with "<s>"; a stray
# leading quote character would become part of the prompt sent to the LLM.
system_prompt = (
    "<s>[INST] You are Friday, a helpful and conversational AI assistant, "
    "and you respond with one to two sentences. [/INST] "
    "Hello there! I'm Friday, how can I help you?</s>"
)

# Module-level conversation state.
# instruct_history: raw prompt text, starts out as the seed conversation.
# formatted_history: human-readable transcript, starts out empty.
instruct_history = system_prompt
formatted_history = ""

# Client used to request text generation from the LLM named above
client = InferenceClient(model=LLM_MODEL_NAME)

# Run speech recognition on the first CUDA device when one is available,
# otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-to-text pipeline built around the ASR model
pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)

def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    """Ask the module-level inference client to complete a prompt.

    Args:
        instruct_history: Prompt text forwarded unchanged to the client.
        temperature: Sampling temperature; coerced to float and raised to
            at least 1e-2.
        max_new_tokens: Forwarded unchanged as the generation length limit.
        top_p: Coerced to float and forwarded.
        repetition_penalty: Forwarded unchanged.

    Returns:
        The result of ``client.text_generation`` for a non-streaming request
        without details and without the prompt echoed back.
    """
    # Enforce the 1e-2 floor on the temperature before sampling
    sampling_temperature = max(float(temperature), 1e-2)
    nucleus_p = float(top_p)

    # Sampling is enabled with a fixed seed of 42
    return client.text_generation(
        instruct_history,
        temperature=sampling_temperature,
        max_new_tokens=max_new_tokens,
        top_p=nucleus_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=False,
        details=False,
        return_full_text=False,
    )

def _prepare_waveform(samples):
    """Return *samples* as a mono float32 waveform scaled to a peak of 1."""
    waveform = np.asarray(samples)
    if waveform.ndim > 1:
        # Gradio documents multi-channel audio as (samples, channels);
        # average the channels because the ASR input is a single waveform.
        waveform = waveform.mean(axis=1)
    waveform = waveform.astype(np.float32)

    # Skip scaling for silent or empty input: dividing by a zero peak would
    # fill the waveform with NaNs.
    peak = np.max(np.abs(waveform)) if waveform.size else 0.0
    if peak > 0:
        waveform /= peak
    return waveform


@spaces.GPU(duration=60)
def transcribe(audio, past_history):
    """Run one voice turn: speech -> text -> LLM reply -> spoken reply.

    Args:
        audio: ``(sampling_rate, samples)`` tuple from the audio input, or
            ``None`` when nothing was recorded.
        past_history: Transcript text currently shown in the UI; the new
            turn is appended to it.

    Returns:
        A ``(path, transcript)`` tuple: the path of the MP3 file holding the
        spoken reply and the updated human-readable transcript.

    Raises:
        gr.Error: If no audio (or an empty recording) was supplied.

    Side effects:
        Updates the module-level ``instruct_history`` and
        ``formatted_history`` and overwrites ``response.mp3``.
    """
    global instruct_history, formatted_history

    if audio is None:
        raise gr.Error("No audio received. Please record a message first.")

    sr, y = audio
    if y is None or np.size(y) == 0:
        raise gr.Error("No audio received. Please record a message first.")
    y = _prepare_waveform(y)

    # Transcribe user audio
    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # The transcript shown in the UI already contains every earlier turn, so
    # it is used as the base; appending it to the stored transcript would
    # duplicate the whole conversation on every call.
    formatted_history = past_history or ""
    formatted_history += f"😃 Human: {transcribed_user_audio}\n\n"

    # Follow-up instructions carry no "<s>": the seed prompt already opens
    # the sequence, and the Mistral instruct format places the
    # begin-of-sentence token only before the first instruction.
    instruct_history += f"[INST] {transcribed_user_audio} [/INST]"

    # Generate LLM response
    llm_response = generate(instruct_history)

    # Append AI response to history
    instruct_history += f" {llm_response}</s>"
    formatted_history += f"🤖 Friday: {llm_response}\n\n"

    # Convert AI response to audio
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print("Formatted History: ", formatted_history)

    # Return the full conversation history
    return "response.mp3", formatted_history

# User interface: microphone input and spoken reply side by side, a button
# that triggers one conversation turn, and a textbox holding the transcript.
with gr.Blocks() as demo:
    # Page title; the <center> element is closed properly so that the rest
    # of the page is not swallowed by an unterminated tag.
    gr.HTML("<center><h1>Friday: AI Virtual Assistant 🤖</h1></center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")

    transcribe_btn = gr.Button("Transcribe")

    # Textbox to display the full conversation history
    transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")

    # The textbox is both an input and an output: its current contents are
    # passed to transcribe, which returns the updated transcript.
    transcribe_btn.click(fn=transcribe, inputs=[audio_input, transcription_box], outputs=[output_audio, transcription_box])

# Start the app only when this file is executed directly
if __name__ == "__main__":
    # queue() hands back the same Blocks object, so the calls can be chained
    demo.queue().launch()