Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,110 +1,119 @@
|
|
1 |
-
import os
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
from
|
5 |
-
import
|
6 |
-
|
|
|
7 |
|
8 |
-
# Initialize the
|
9 |
-
client =
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
|
|
13 |
|
14 |
-
def
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
response_message = chat_completion.choices[0].message.content.strip()
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
response_audio_io = io.BytesIO()
|
35 |
-
tts.write_to_fp(response_audio_io) # Save the audio to the BytesIO object
|
36 |
-
response_audio_io.seek(0)
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
return f"An error occurred: {e}", None
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
.gradio-container {
|
55 |
-
font-family: Arial, sans-serif;
|
56 |
-
background-color: #f0f4c3; /* Light green background color */
|
57 |
-
border-radius: 10px;
|
58 |
-
padding: 20px;
|
59 |
-
box-shadow: 0 4px 12px rgba(0,0,0,0.2);
|
60 |
-
text-align: center;
|
61 |
-
}
|
62 |
-
.gradio-input, .gradio-output {
|
63 |
-
border-radius: 6px;
|
64 |
-
border: 1px solid #ddd;
|
65 |
-
padding: 10px;
|
66 |
-
}
|
67 |
-
.gradio-button {
|
68 |
-
background-color: #ff7043;
|
69 |
-
color: white;
|
70 |
-
border-radius: 6px;
|
71 |
-
border: none;
|
72 |
-
padding: 10px 20px; /* Adjusted padding */
|
73 |
-
font-size: 16px; /* Adjusted font size */
|
74 |
-
cursor: pointer;
|
75 |
-
}
|
76 |
-
.gradio-button:hover {
|
77 |
-
background-color: #e64a19;
|
78 |
-
}
|
79 |
-
.gradio-title {
|
80 |
-
font-size: 28px;
|
81 |
-
font-weight: bold;
|
82 |
-
margin-bottom: 20px;
|
83 |
-
color: #37474f;
|
84 |
-
}
|
85 |
-
.gradio-description {
|
86 |
-
font-size: 16px;
|
87 |
-
margin-bottom: 20px;
|
88 |
-
color: #616161;
|
89 |
-
}
|
90 |
-
</style>
|
91 |
-
"""
|
92 |
-
)
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
|
107 |
-
|
|
|
|
|
108 |
|
109 |
-
|
110 |
-
demo.launch()
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from huggingface_hub import InferenceClient
|
3 |
+
from transformers import pipeline
|
4 |
+
import edge_tts
|
5 |
+
import tempfile
|
6 |
+
import asyncio
|
7 |
|
8 |
+
# Remote text-generation client bound to the Mistral instruct model.
# NOTE(review): no token is passed explicitly here; presumably credentials
# come from the environment -- confirm.
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.1")
# Speech-recognition pipeline used by speech_to_text().
asr = pipeline(task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
|
12 |
|
13 |
+
def speech_to_text(speech):
    """Run the module-level ASR pipeline on *speech* and return its "text" field."""
    transcription = asr(speech)
    return transcription["text"]
|
16 |
|
17 |
+
def classify_mood(input_string):
    """Classify the mood by searching the text for known mood keywords.

    Matching is a case-insensitive substring search. When several keywords
    occur, the one that appears earliest in the text is returned, so the
    result is deterministic and does not depend on set iteration order.

    Args:
        input_string: Text to scan. ``None`` or an empty string yields no match.

    Returns:
        A ``(mood, True)`` tuple with the lowercase keyword that matched, or
        ``(None, False)`` when no keyword is present.
    """
    if not input_string:
        return None, False

    lowered = input_string.lower()
    mood_words = ("happy", "sad", "instrumental", "party")

    # Collect (position, keyword) for every keyword that occurs in the text.
    found = []
    for word in mood_words:
        position = lowered.find(word)
        if position != -1:
            found.append((position, word))

    if not found:
        return None, False

    # The smallest position is the keyword mentioned first.
    return min(found)[1], True
|
25 |
|
26 |
+
def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Stream a model reply and stop early once a mood keyword shows up.

    The prompt is built with format_prompt() and sent to the module-level
    inference client as a streaming text-generation request. After every
    received token the accumulated text is passed to classify_mood(); on the
    first match a playlist message is returned instead of the raw reply.

    Args:
        prompt: Latest user message.
        history: Earlier (user, bot) turns, forwarded to format_prompt().
        temperature: Sampling temperature; values below 1e-2 are raised to 1e-2.
        max_new_tokens: Forwarded to the text-generation call.
        top_p: Forwarded to the text-generation call after float conversion.
        repetition_penalty: Forwarded to the text-generation call.

    Returns:
        "Playing <Mood> playlist for you!" if a mood was detected, otherwise
        the full generated text.
    """
    # Keep the temperature at or above a small positive floor.
    sampling_temperature = max(float(temperature), 1e-2)

    request_options = {
        "temperature": sampling_temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": float(top_p),
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    full_prompt = format_prompt(prompt, history)
    token_stream = client.text_generation(
        full_prompt,
        stream=True,
        details=True,
        return_full_text=False,
        **request_options,
    )

    reply = ""
    for chunk in token_stream:
        reply += chunk.token.text
        # Re-check the whole reply so far: a keyword may span several tokens.
        mood, matched = classify_mood(reply)
        if matched:
            return f"Playing {mood.capitalize()} playlist for you!"
    return reply
|
53 |
|
54 |
+
def format_prompt(message, history):
    """Build the full LLM prompt from fixed instructions, history and the new message.

    Args:
        message: Latest user message.
        history: Iterable of (user_prompt, bot_response) pairs from earlier
            turns. ``None`` is treated as an empty history.

    Returns:
        The instruction text, followed by one "User: ... / LLM Response: ..."
        pair per past turn, ending with the new user message and an open
        "LLM Response:" cue for the model to complete.
    """
    fixed_prompt = """
You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".

Note: Do not write anything else other than the classified mood if classified.

Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.

Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.

Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.

[Examples omitted for brevity]
"""
    # Gather the pieces and join once instead of growing a string with "+=".
    parts = [fixed_prompt]
    for user_prompt, bot_response in history or []:
        parts.append(f"\nUser: {user_prompt}\nLLM Response: {bot_response}")
    parts.append(f"\nUser: {message}\nLLM Response:")
    return "".join(parts)
|
74 |
|
75 |
+
async def text_to_speech(text):
    """Synthesize *text* with edge-tts and return the path of the saved audio file.

    Args:
        text: The text to speak.

    Returns:
        Path of a temporary audio file. The file is created with
        ``delete=False``, so it stays on disk after this function returns and
        the caller is responsible for any cleanup.
    """
    communicate = edge_tts.Communicate(text)
    # edge-tts emits MP3 data by default, so label the file accordingly
    # instead of giving MP3 bytes a ".wav" suffix.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    # Only the reserved path is needed; the handle is closed before edge-tts
    # opens the same path for writing.
    await communicate.save(tmp_path)
    return tmp_path
|
81 |
+
|
82 |
+
def chatbot(audio, history):
    """Handle a recorded audio message: transcribe it, get a reply, extend the history.

    Args:
        audio: Value of the audio input (passed to speech_to_text()), or
            ``None`` when there is no recording.
        history: List of (user_text, bot_reply) pairs; ``None`` is treated as
            an empty history.

    Returns:
        ``(None, history)``. The first element is the new value for the audio
        input: ``None`` clears it, whereas an empty string is not a valid
        audio file path.
    """
    if history is None:
        history = []

    if audio is None:
        return None, history

    text = speech_to_text(audio)
    response = generate(text, history)
    history.append((text, response))

    return None, history
|
91 |
+
|
92 |
+
def text_input(text, history):
    """Handle a typed message: get a reply and extend the history.

    Args:
        text: The message typed by the user.
        history: List of (user_text, bot_reply) pairs; ``None`` is treated as
            an empty history.

    Returns:
        ``("", history)``. The empty string clears the textbox.
    """
    if history is None:
        history = []

    # Mirror the "no input" guard of the audio handler: do not send blank
    # messages to the model.
    if not text or not text.strip():
        return "", history

    response = generate(text, history)
    history.append((text, response))
    return "", history
|
96 |
+
|
97 |
+
async def generate_audio(history):
    """Synthesize speech for the most recent bot reply in *history*.

    Args:
        history: List of (user_text, bot_reply) pairs, possibly empty or ``None``.

    Returns:
        Path of the audio file produced by text_to_speech(), or ``None`` when
        there is no history or the latest reply is empty.
    """
    if not history:
        return None

    last_response = history[-1][1]
    # Nothing to speak yet (reply missing or empty): skip the TTS request.
    if not last_response:
        return None

    return await text_to_speech(last_response)
|
103 |
+
|
104 |
+
# Gradio interface setup
#
# The component variables deliberately use names different from the handler
# functions chatbot() and text_input(): this ``with`` block runs at module
# level, so reusing those names would rebind them to components and the event
# listeners below would receive a component instead of a callable.
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")

    chat_display = gr.Chatbot()
    # NOTE(review): ``source=`` is the Gradio 3.x keyword; newer releases use
    # ``sources=[...]`` -- confirm against the installed version.
    mic_input = gr.Audio(source="microphone", type="filepath")
    message_box = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    # A new recording or a submitted message updates the conversation ...
    mic_input.change(chatbot, inputs=[mic_input, chat_display], outputs=[mic_input, chat_display])
    message_box.submit(text_input, inputs=[message_box, chat_display], outputs=[message_box, chat_display])

    # ... and every conversation change triggers speech synthesis of the last reply.
    chat_display.change(generate_audio, inputs=[chat_display], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()
|