File size: 4,627 Bytes
e4c39da
f8d538d
 
 
 
 
e4c39da
f8d538d
 
 
 
e4c39da
f8d538d
 
 
e4c39da
f8d538d
 
 
 
 
 
 
 
e4c39da
f8d538d
 
 
 
 
e4c39da
f8d538d
 
 
 
 
 
 
 
e4c39da
f8d538d
2bcdf1f
f8d538d
 
2bcdf1f
f8d538d
 
 
 
 
 
 
2bcdf1f
f8d538d
 
 
 
2bcdf1f
f8d538d
e4c39da
f8d538d
 
 
 
 
2bcdf1f
f8d538d
 
 
 
 
 
 
2bcdf1f
f8d538d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5251d1b
f8d538d
 
 
 
e4c39da
f8d538d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio

# Initialize the inference client with your Hugging Face token
# (relies on HF_TOKEN from the environment; the model id is the remote endpoint).
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
# Initialize the ASR pipeline
# wav2vec2-base-960h runs locally via transformers and transcribes English speech.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

def speech_to_text(speech):
    """Transcribe an audio input (file path) to text via the wav2vec2 ASR pipeline."""
    transcription = asr(speech)
    return transcription["text"]

def classify_mood(input_string):
    """Classify the mood based on keywords in the input string.

    Args:
        input_string: Arbitrary text (user input or streamed LLM output)
            to scan for mood keywords, case-insensitively.

    Returns:
        A ``(mood, True)`` pair for the first matching mood keyword, or
        ``(None, False)`` when no keyword is present.
    """
    lowered = input_string.lower()
    # Fix: scan an ordered tuple instead of a set. Set iteration order
    # varies across interpreter runs (string hash randomization), so the
    # original returned a nondeterministic mood whenever the text
    # mentioned more than one keyword.
    for word in ("happy", "sad", "instrumental", "party"):
        if word in lowered:
            return word, True
    return None, False

def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Stream a mood-classification reply from the hosted LLM.

    Accumulates streamed tokens and, as soon as the partial output contains
    one of the four mood keywords, short-circuits with a playlist message.
    Otherwise returns the full model output.
    """
    # Clamp temperature away from zero; the endpoint rejects values below 1e-2.
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    stream = client.text_generation(
        format_prompt(prompt, history),
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=True,
        details=True,
        return_full_text=False,
    )

    output = ""
    for chunk in stream:
        output += chunk.token.text
        mood, is_classified = classify_mood(output)
        if is_classified:
            # Mood detected mid-stream: stop generating and answer directly.
            return f"Playing {mood.capitalize()} playlist for you!"
    return output

def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
    You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".

    Note: Do not write anything else other than the classified mood if classified.

    Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.

    Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.

    Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.

    [Examples omitted for brevity]
    """
    # Render each past exchange as a User/LLM turn, then append the new message.
    past_turns = [
        f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
        for user_prompt, bot_response in history
    ]
    return fixed_prompt + "".join(past_turns) + f"\nUser: {message}\nLLM Response:"

async def text_to_speech(text):
    """Synthesize *text* with edge-tts into a temp file and return its path.

    The caller is responsible for eventually deleting the file
    (``delete=False`` keeps it alive after the handle closes).
    """
    communicate = edge_tts.Communicate(text)
    # Fix: edge-tts emits MP3 audio by default, so label the file ".mp3";
    # the original ".wav" suffix mislabeled the MP3 payload, which can
    # confuse players/components that trust the extension.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    # Fix: write after the handle is closed — writing to a still-open
    # NamedTemporaryFile path fails on Windows due to exclusive locking.
    await communicate.save(tmp_path)
    return tmp_path

def chatbot(audio, history):
    """Handle one microphone turn: transcribe, query the LLM, extend history.

    Returns ("", history): the empty string clears the audio widget, and the
    updated history re-renders the chat window.
    """
    if audio is None:
        return "", history

    user_text = speech_to_text(audio)
    reply = generate(user_text, history)
    history.append((user_text, reply))
    return "", history

def text_input(text, history):
    """Handle one typed turn: query the LLM and extend the chat history."""
    reply = generate(text, history)
    history.append((text, reply))
    return "", history

async def generate_audio(history):
    """Speak the most recent bot reply; return the audio file path, or None
    when there is no conversation yet."""
    if not history:
        return None
    latest_reply = history[-1][1]
    return await text_to_speech(latest_reply)

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")

    # Fix: the components were previously named `chatbot` and `text_input`,
    # shadowing the handler functions of the same names defined above. The
    # event wiring then registered the *components* as callbacks instead of
    # the functions, so neither voice nor text input could ever work.
    # Distinct component names restore the intended handlers.
    chat_window = gr.Chatbot()
    # NOTE(review): `source=` is Gradio 3.x API (4.x uses `sources=[...]`) —
    # kept as-is to match the file's Gradio version; confirm against deps.
    audio_in = gr.Audio(source="microphone", type="filepath")
    text_box = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    # Voice turn: transcribe + respond, clearing the mic widget afterwards.
    audio_in.change(chatbot, inputs=[audio_in, chat_window], outputs=[audio_in, chat_window])
    # Text turn: respond and clear the textbox.
    text_box.submit(text_input, inputs=[text_box, chat_window], outputs=[text_box, chat_window])

    # Whenever the chat updates, speak the latest reply.
    chat_window.change(generate_audio, inputs=[chat_window], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()