Spaces:
Sleeping
Sleeping
File size: 4,627 Bytes
e4c39da f8d538d e4c39da f8d538d e4c39da f8d538d e4c39da f8d538d e4c39da f8d538d e4c39da f8d538d e4c39da f8d538d 2bcdf1f f8d538d 2bcdf1f f8d538d 2bcdf1f f8d538d 2bcdf1f f8d538d e4c39da f8d538d 2bcdf1f f8d538d 2bcdf1f f8d538d 5251d1b f8d538d e4c39da f8d538d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio
# Remote text-generation backend (Hugging Face serverless Inference API).
# NOTE(review): no token is passed here — the comment above the original line
# mentioned one; presumably auth comes from the HF_TOKEN env var. Confirm.
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
# Local automatic-speech-recognition pipeline used to transcribe mic input.
# Downloads the wav2vec2 checkpoint on first run.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
def speech_to_text(speech):
    """Transcribe an audio input to text with the module-level ASR pipeline.

    Args:
        speech: Audio accepted by the transformers ASR pipeline
            (here, a filepath produced by the Gradio microphone widget).

    Returns:
        The transcription string from the pipeline's ``"text"`` field.
    """
    transcription = asr(speech)
    return transcription["text"]
def classify_mood(input_string):
    """Classify the user's mood from keywords found in ``input_string``.

    The original implementation iterated over a ``set`` of keywords, so when
    the input contained more than one mood word (e.g. "happy and sad") the
    result varied between runs (set iteration order depends on string-hash
    randomization). A tuple fixes the match priority deterministically.

    Args:
        input_string: Free-form user/LLM text; matched case-insensitively.

    Returns:
        A ``(mood, classified)`` pair — ``(keyword, True)`` for the first
        keyword found in priority order, or ``(None, False)`` if none match.
    """
    lowered = input_string.lower()
    for mood in ("happy", "sad", "instrumental", "party"):
        if mood in lowered:
            return mood, True
    return None, False
def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Stream an LLM reply, short-circuiting as soon as a mood is detected.

    Args:
        prompt: The new user message.
        history: List of ``(user, bot)`` turns used to build the prompt.
        temperature: Sampling temperature; clamped to at least 1e-2.
        max_new_tokens: Generation length cap.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        Either a "Playing <Mood> playlist for you!" message (once the
        streamed text contains a mood keyword) or the full generated text.
    """
    # Clamp temperature away from zero; the endpoint rejects ~0 values.
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    stream = client.text_generation(
        format_prompt(prompt, history),
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=True,
        details=True,
        return_full_text=False,
    )

    accumulated = ""
    for chunk in stream:
        accumulated += chunk.token.text
        # Re-classify after every token so we can stop streaming early.
        mood, found = classify_mood(accumulated)
        if found:
            return f"Playing {mood.capitalize()} playlist for you!"
    return accumulated
def format_prompt(message, history):
    """Assemble the full LLM prompt: fixed instructions, then the dialog.

    Args:
        message: The new user message to append last.
        history: Iterable of ``(user, bot)`` turn pairs.

    Returns:
        The instruction preamble followed by every past turn and the new
        message, ending with an open ``LLM Response:`` cue.
    """
    fixed_prompt = """
You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
Note: Do not write anything else other than the classified mood if classified.
Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.
Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.
Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.
[Examples omitted for brevity]
"""
    turns = [f"\nUser: {user}\nLLM Response: {bot}" for user, bot in history]
    return fixed_prompt + "".join(turns) + f"\nUser: {message}\nLLM Response:"
async def text_to_speech(text):
    """Synthesize ``text`` to a temporary .wav file via edge-tts.

    The file is created with ``delete=False``, so it survives this function;
    the caller (Gradio's audio component) is responsible for its lifetime.

    Returns:
        Path to the generated audio file.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name
    speaker = edge_tts.Communicate(text)
    await speaker.save(output_path)
    return output_path
def chatbot(audio, history):
    """Handle one voice turn: transcribe the clip, get a reply, log the pair.

    Args:
        audio: Filepath from the microphone widget, or None if nothing
            was recorded.
        history: Mutable list of ``(user, bot)`` turns; appended in place.

    Returns:
        ``("", history)`` — the empty string clears the input widget.
    """
    if audio is None:
        return "", history
    user_text = speech_to_text(audio)
    reply = generate(user_text, history)
    history.append((user_text, reply))
    return "", history
def text_input(text, history):
    """Handle one typed turn: generate a reply and record the exchange.

    Args:
        text: The user's typed message.
        history: Mutable list of ``(user, bot)`` turns; appended in place.

    Returns:
        ``("", history)`` — the empty string clears the textbox.
    """
    reply = generate(text, history)
    history.append((text, reply))
    return "", history
async def generate_audio(history):
    """Speak the most recent bot reply from ``history``.

    Returns:
        Path to the synthesized audio file, or None when there is no
        conversation yet.
    """
    if not history:
        return None
    latest_reply = history[-1][1]
    return await text_to_speech(latest_reply)
# Gradio interface setup.
# BUG FIX: the original code named the Chatbot component `chatbot` and the
# Textbox component `text_input`, shadowing the handler functions of the same
# names defined above. The .change()/.submit() calls therefore registered
# gradio *components* as event callbacks instead of the functions, breaking
# both input paths at runtime. Renaming the components restores the wiring.
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")
    chat_window = gr.Chatbot()
    audio_input = gr.Audio(source="microphone", type="filepath")
    message_box = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    # Voice turn: transcribe + respond, then clear the audio widget.
    audio_input.change(chatbot, inputs=[audio_input, chat_window], outputs=[audio_input, chat_window])
    # Typed turn: respond, then clear the textbox.
    message_box.submit(text_input, inputs=[message_box, chat_window], outputs=[message_box, chat_window])
    # Whenever the chat updates, synthesize and play the latest reply.
    chat_window.change(generate_audio, inputs=[chat_window], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()