syedmudassir16
commited on
dr_infer_voice_to_text
Browse files
app.py
CHANGED
@@ -1,10 +1,31 @@
|
|
1 |
-
|
|
|
|
|
2 |
import gradio as gr
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user’s mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
|
9 |
|
10 |
Note: Do not write anything else other than the classified mood if classified.
|
@@ -174,14 +195,43 @@ def format_prompt(message, history):
|
|
174 |
User: Lets turn up the music and have some fun!
|
175 |
LLM Response: Party
|
176 |
"""
|
177 |
-
|
178 |
-
|
179 |
-
prompt
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
def classify_mood(input_string):
|
186 |
input_string = input_string.lower()
|
187 |
mood_words = {"happy", "sad", "instrumental", "party"}
|
@@ -190,48 +240,43 @@ def classify_mood(input_string):
|
|
190 |
return word, True
|
191 |
return None, False
|
192 |
|
193 |
-
def
|
194 |
-
|
195 |
-
)
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
if is_classified:
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
clear_btn=None,
|
234 |
-
description="<span style='font-size: larger; font-weight: bold;'>Hi! I'm a music recommender app. What kind of music do you want to listen to, or how are you feeling today?</span>",
|
235 |
-
)
|
236 |
-
|
237 |
-
demo.queue().launch()
|
|
|
1 |
+
import argparse
|
2 |
+
import torch
|
3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
|
4 |
import gradio as gr
|
5 |
|
6 |
+
class llmChatbot:
|
7 |
+
def __init__(self, model_name, temperature=0.3, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Load a 4-bit quantized causal LM with its tokenizer and store sampling defaults.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.
        temperature: Default sampling temperature kept on the instance.
        max_new_tokens: Default cap on generated tokens kept on the instance.
        top_p: Default nucleus-sampling threshold kept on the instance.
        repetition_penalty: Default repetition penalty kept on the instance.
    """
    # Generation defaults; stored as-is for later use.
    self.temperature, self.max_new_tokens = temperature, max_new_tokens
    self.top_p, self.repetition_penalty = top_p, repetition_penalty

    # 4-bit NF4 quantization; the compute dtype is passed by name
    # ("float16") rather than as a torch dtype object.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
    )
    self.model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=nf4_config,
        device_map="auto",
    )

    # Fall back to the EOS token for padding when the tokenizer has no pad token.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    self.tokenizer = tokenizer
|
26 |
+
|
27 |
+
def format_prompt(self, message, history):
|
28 |
+
fixed_prompt = """
|
29 |
You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user’s mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
|
30 |
|
31 |
Note: Do not write anything else other than the classified mood if classified.
|
|
|
195 |
User: Lets turn up the music and have some fun!
|
196 |
LLM Response: Party
|
197 |
"""
|
198 |
+
|
199 |
+
# Start with the fixed prompt
|
200 |
+
prompt = f"<s>{fixed_prompt}"
|
201 |
+
|
202 |
+
# Append the conversation history
|
203 |
+
for user_prompt, bot_response in history:
|
204 |
+
prompt += f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
|
205 |
+
|
206 |
+
# Add the current message
|
207 |
+
prompt += f"\nUser: {message}\nLLM Response:"
|
208 |
+
|
209 |
+
return prompt
|
210 |
+
|
211 |
+
def generate(self, message, history, temperature=None, max_new_tokens=None, top_p=None, repetition_penalty=None):
    """Generate the model's reply to *message* given the prior *history*.

    Args:
        message: The current user message.
        history: Prior turns, forwarded unchanged to ``self.format_prompt``.
        temperature: Sampling temperature; ``None`` uses ``self.temperature``.
        max_new_tokens: Cap on generated tokens; ``None`` uses
            ``self.max_new_tokens``.
        top_p: Nucleus-sampling threshold; ``None`` uses ``self.top_p``.
        repetition_penalty: Repetition penalty; ``None`` uses
            ``self.repetition_penalty``.

    Returns:
        The decoded continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    if temperature is None:
        temperature = self.temperature
    if max_new_tokens is None:
        max_new_tokens = self.max_new_tokens
    if top_p is None:
        top_p = self.top_p
    if repetition_penalty is None:
        repetition_penalty = self.repetition_penalty

    prompt = self.format_prompt(message, history)

    # Follow the model's own placement instead of hard-coding "cuda", so the
    # inputs land wherever the model was actually loaded.
    inputs = self.tokenizer(prompt, return_tensors="pt", padding=True).to(self.model.device)
    prompt_length = len(inputs["input_ids"][0])

    output_ids = self.model.generate(
        **inputs,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        pad_token_id=self.tokenizer.pad_token_id,  # explicit pad id for generation
    )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is unreliable: decoding drops special tokens (e.g. "<s>"),
    # so the decoded prompt and the prompt string do not line up and the
    # slice can cut into the reply.
    new_token_ids = output_ids[0][prompt_length:]
    return self.tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
|
234 |
+
|
235 |
def classify_mood(input_string):
|
236 |
input_string = input_string.lower()
|
237 |
mood_words = {"happy", "sad", "instrumental", "party"}
|
|
|
240 |
return word, True
|
241 |
return None, False
|
242 |
|
243 |
+
def speech_to_text(speech):
    """Transcribe *speech* with the wav2vec2 ASR pipeline and return the text.

    The pipeline is built on first use and cached on the function object, so
    the model is not reloaded for every request.

    Args:
        speech: Audio input forwarded unchanged to the ASR pipeline.

    Returns:
        The ``"text"`` field of the pipeline's result.
    """
    asr = getattr(speech_to_text, "_pipeline", None)
    if asr is None:
        # Building the pipeline loads the model; do it once and reuse it.
        asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
        speech_to_text._pipeline = asr
    return asr(speech)["text"]
|
247 |
+
|
248 |
+
def text_to_speech(text):
    """Synthesize speech for *text* and return the pipeline's ``"audio"`` entry.

    The pipeline is built on first use and cached on the function object, so
    the model is not reloaded for every request.

    NOTE(review): only the "audio" field is returned; if the consumer also
    needs the sampling rate it is dropped here - confirm against the caller.
    NOTE(review): verify that "facebook/fastspeech2-en-ljspeech" can be loaded
    by the transformers text-to-speech pipeline - TODO confirm.

    Args:
        text: The text to synthesize.

    Returns:
        The ``"audio"`` field of the pipeline's result.
    """
    tts = getattr(text_to_speech, "_pipeline", None)
    if tts is None:
        # Building the pipeline loads the model; do it once and reuse it.
        tts = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
        text_to_speech._pipeline = tts
    return tts(text)["audio"]
|
252 |
+
|
253 |
+
if __name__ == "__main__":
    # Script entry point: parse the model name, load the chatbot, and serve a
    # microphone-in / audio+text-out Gradio interface.
    parser = argparse.ArgumentParser(description="Start the Mistral chatbot application.")
    parser.add_argument(
        "--model_name",
        type=str,
        default="mistralai/Mistral-7B-Instruct-v0.2",
        help="The name of the model to use.",
    )

    args = parser.parse_args()
    model_name = args.model_name

    # Instantiate the chatbot with necessary parameters
    mistral_chatbot = llmChatbot(model_name=model_name)
    # Conversation turns as (user_text, response_text); appended to by
    # chatbot_response below.
    history = []
    print("How are you doing today?")

    def chatbot_response(audio_input):
        """Transcribe the recording, query the chatbot, and voice the reply.

        Returns an ``(audio_output, response_text)`` pair. When the model's
        reply contains a recognised mood, the response is just that mood,
        capitalized; otherwise it is the model's reply unchanged.
        """
        # The callback can be invoked with no recording (e.g. after the input
        # is cleared); there is nothing to transcribe in that case.
        if audio_input is None:
            return None, ""

        text_input = speech_to_text(audio_input)
        result = mistral_chatbot.generate(text_input, history)
        mood, is_classified = classify_mood(result)
        response_text = mood.capitalize() if is_classified else result
        audio_output = text_to_speech(response_text)
        history.append((text_input, response_text))
        return audio_output, response_text

    gr.Interface(
        fn=chatbot_response,
        inputs=gr.Audio(source="microphone", type="filepath"),
        outputs=[gr.Audio(type="numpy"), "text"],
        live=True,
    ).launch()
|
|
|
|
|
|
|
|
|
|