import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio
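# Assumed runtime dependencies (not pinned in this snippet): gradio, huggingface_hub,
# transformers (plus a backend such as torch for the ASR pipeline), and edge-tts.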

# Initialize the inference client for the hosted Mistral model
# (pass token="hf_..." if your Hugging Face token is required).
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# Initialize the ASR pipeline
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]

def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False
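
# For illustration: classify_mood("Let's party tonight!") returns ("party", True),
# while classify_mood("I had a long day") returns (None, False).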

def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Streams a reply from the LLM and short-circuits once a mood keyword is detected."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""

    for response in stream:
        output += response.token.text
        mood, is_classified = classify_mood(output)
        # Stop streaming as soon as one of the four mood keywords appears in the output.
        if is_classified:
            playlist_message = f"Playing {mood.capitalize()} playlist for you!"
            return playlist_message
    return output

def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
You are a smart mood analyser who determines the user's mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you find it difficult to classify the input into one of these four moods, keep the conversation going until the user's mood can be classified. Return a single-word reply from one of the options once you have classified. For example, if you classify a sentence as happy, respond with "happy".
Note: Do not write anything other than the classified mood once it is classified.
Note: If any question or user text cannot be classified, follow up with a question to learn the user's mood until you can classify it.
Note: The mood should be classified only into one of these 4 classes {Happy, Sad, Instrumental, Party}; if it is not one of these 4, continue with follow-up questions until you classify the mood.
Note: If the user says something like "I need a coffee", do not classify the mood directly; ask more follow-up questions as shown in the examples.
[Examples omitted for brevity]
"""
    prompt = fixed_prompt
    for user_prompt, bot_response in history:
        prompt += f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
    prompt += f"\nUser: {message}\nLLM Response:"
    return prompt
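
# For illustration, with history = [("hi", "How are you feeling today?")] and
# message = "pretty good", the assembled prompt ends with:
#   User: hi
#   LLM Response: How are you feeling today?
#   User: pretty good
#   LLM Response: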

async def text_to_speech(text):
    """Synthesizes speech for the given text with edge-tts and returns the audio file path."""
    communicate = edge_tts.Communicate(text)
    # edge-tts emits MP3 audio by default, so use an .mp3 suffix for the temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
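
# A specific voice can be requested, e.g. edge_tts.Communicate(text, "en-US-AriaNeural");
# the library's default voice is used here.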

def voice_chat(audio, history):
    """Handles microphone input: transcribe, generate a reply, and update the chat history.

    (Renamed from `chatbot` so the handler does not shadow the gr.Chatbot component below.)
    """
    if audio is None:
        return None, history
    text = speech_to_text(audio)
    response = generate(text, history)
    history.append((text, response))
    # Return None to clear the audio input, plus the updated history.
    return None, history

def text_chat(text, history):
    """Handles typed input: generate a reply and update the chat history.

    (Renamed from `text_input` so the handler does not shadow the gr.Textbox component below.)
    """
    response = generate(text, history)
    history.append((text, response))
    return "", history

async def generate_audio(history):
    """Synthesizes audio for the most recent bot response, if any."""
    if len(history) > 0:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None
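
# Gradio accepts async functions as event handlers, so generate_audio can be bound
# directly to the chatbot's change event below.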

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")
    chatbot = gr.Chatbot()
    # Gradio 3.x style; in Gradio 4.x this becomes sources=["microphone"].
    audio_input = gr.Audio(source="microphone", type="filepath")
    text_input = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    audio_input.change(voice_chat, inputs=[audio_input, chatbot], outputs=[audio_input, chatbot])
    text_input.submit(text_chat, inputs=[text_input, chatbot], outputs=[text_input, chatbot])
    chatbot.change(generate_audio, inputs=[chatbot], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()
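
# When deployed as a Hugging Face Space, this file is assumed to be named app.py so the
# Gradio SDK runner starts it automatically; locally, run it with `python app.py`.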