Voice-To-Voice_test

Sleeping

App Files Files Community

syedmudassir16 committed on Sep 24, 2024

Commit

f8d538d

verified ·

1 Parent(s): fb5cd3b

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -94

app.py CHANGED Viewed

@@ -1,110 +1,119 @@
-import os
 import gradio as gr
-import whisper
-from gtts import gTTS
-import io
-from groq import Groq
-# Initialize the Groq client
-client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-# Load the Whisper model
-model = whisper.load_model("base")
-def process_audio(file_path):
-    try:
-        # Load the audio file
-        audio = whisper.load_audio(file_path)
-        # Transcribe the audio using Whisper
-        result = model.transcribe(audio)
-        text = result["text"]
-        # Generate a response using Groq
-        chat_completion = client.chat.completions.create(
-            messages=[{"role": "user", "content": text}],
-            model="llama3-8b-8192",  # Replace with the correct model if necessary
-        )
-        # Access the response using dot notation
-        response_message = chat_completion.choices[0].message.content.strip()
-        # Convert the response text to speech
-        tts = gTTS(response_message)
-        response_audio_io = io.BytesIO()
-        tts.write_to_fp(response_audio_io)  # Save the audio to the BytesIO object
-        response_audio_io.seek(0)
-        # Save audio to a file to ensure it's generated correctly
-        response_audio_path = "response.mp3"
-        with open(response_audio_path, "wb") as audio_file:
-            audio_file.write(response_audio_io.getvalue())
-        # Return the response text and the path to the saved audio file
-        return response_message, response_audio_path
-    except Exception as e:
-        return f"An error occurred: {e}", None
-# Create the Gradio interface with customized UI
-with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        <style>
-        .gradio-container {
-            font-family: Arial, sans-serif;
-            background-color: #f0f4c3;  /* Light green background color */
-            border-radius: 10px;
-            padding: 20px;
-            box-shadow: 0 4px 12px rgba(0,0,0,0.2);
-            text-align: center;
-        }
-        .gradio-input, .gradio-output {
-            border-radius: 6px;
-            border: 1px solid #ddd;
-            padding: 10px;
-        }
-        .gradio-button {
-            background-color: #ff7043;
-            color: white;
-            border-radius: 6px;
-            border: none;
-            padding: 10px 20px;  /* Adjusted padding */
-            font-size: 16px;  /* Adjusted font size */
-            cursor: pointer;
-        }
-        .gradio-button:hover {
-            background-color: #e64a19;
-        }
-        .gradio-title {
-            font-size: 28px;
-            font-weight: bold;
-            margin-bottom: 20px;
-            color: #37474f;
-        }
-        .gradio-description {
-            font-size: 16px;
-            margin-bottom: 20px;
-            color: #616161;
-        }
-        </style>
-        """
-    )
-    gr.Markdown("# Voice-to-Voice Chatbot\nDeveloped by Salman Maqbool ❤️")
-    gr.Markdown("Upload an audio file to interact with the voice-to-voice chatbot. The chatbot will transcribe the audio, generate a response, and provide a spoken reply.")
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-            submit_button = gr.Button("Submit")
-        with gr.Column():
-            response_text = gr.Textbox(label="Response Text", placeholder="The AI-generated response will appear here", lines=5)
-            response_audio = gr.Audio(label="Response Audio", type="filepath")
-    # Link the submit button to the process_audio function
-    submit_button.click(fn=process_audio, inputs=audio_input, outputs=[response_text, response_audio])
-# Launch the Gradio app
-demo.launch()

 import gradio as gr
+from huggingface_hub import InferenceClient
+from transformers import pipeline
+import edge_tts
+import tempfile
+import asyncio
+# Initialize the Hugging Face inference client for the Mistral-7B-Instruct model.
+# NOTE(review): no token is passed here; presumably authentication comes from the environment — confirm.
+client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+# Initialize the ASR pipeline (wav2vec2 checkpoint) used by speech_to_text().
+asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
+def speech_to_text(speech):
+    """Run the module-level ASR pipeline on *speech* and return the recognised text."""
+    transcription = asr(speech)
+    return transcription["text"]
+def classify_mood(input_string):
+    """Detect a mood keyword in *input_string*.
+
+    The check is a case-insensitive substring search for "happy", "sad",
+    "instrumental" or "party". When several keywords occur, the one that
+    appears earliest in the text wins, so the answer is deterministic
+    (iterating a set of strings made it depend on hash ordering).
+
+    Returns:
+        ``(mood, True)`` with the lower-case keyword when one is found,
+        otherwise ``(None, False)``.
+    """
+    lowered = input_string.lower()
+    mood_words = ("happy", "sad", "instrumental", "party")
+    # Pair every keyword that occurs with its first position; min() then
+    # selects the earliest occurrence.
+    hits = [(lowered.find(word), word) for word in mood_words if word in lowered]
+    if not hits:
+        return None, False
+    return min(hits)[1], True
+def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
+    """Stream a reply from the inference client, stopping early once a mood keyword appears.
+
+    The prompt is assembled by ``format_prompt`` and sent as a sampled,
+    streamed text-generation request. After every received token the text
+    accumulated so far is checked with ``classify_mood``; on the first hit
+    the "Playing ... playlist" message is returned instead of the raw text.
+    If no mood is detected, the complete generated text is returned.
+    """
+    # Temperatures below 0.01 are raised to 0.01.
+    sampling_temperature = max(float(temperature), 1e-2)
+    sampling_options = {
+        "temperature": sampling_temperature,
+        "max_new_tokens": max_new_tokens,
+        "top_p": float(top_p),
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True,
+        "seed": 42,
+    }
+    token_stream = client.text_generation(
+        format_prompt(prompt, history),
+        **sampling_options,
+        stream=True,
+        details=True,
+        return_full_text=False,
+    )
+    generated = ""
+    for chunk in token_stream:
+        generated += chunk.token.text
+        mood, found = classify_mood(generated)
+        if not found:
+            continue
+        return f"Playing {mood.capitalize()} playlist for you!"
+    return generated
+def format_prompt(message, history):
+    """Build the text sent to the model: fixed instructions, past turns, then the new message.
+
+    Each ``(user, bot)`` pair in *history* is rendered as a
+    "User: ... / LLM Response: ..." exchange, and the prompt ends with an
+    open "LLM Response:" for the model to complete.
+    """
+    instructions = """
+    You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".
+    Note: Do not write anything else other than the classified mood if classified.
+    Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.
+    Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.
+    Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.
+    [Examples omitted for brevity]
+    """
+    pieces = [instructions]
+    for past_user, past_reply in history:
+        pieces.append(f"\nUser: {past_user}\nLLM Response: {past_reply}")
+    pieces.append(f"\nUser: {message}\nLLM Response:")
+    return "".join(pieces)
+async def text_to_speech(text):
+    """Synthesize *text* with edge-tts and return the path of the resulting audio file.
+
+    The temporary file is created and closed first and only then written by
+    edge-tts, because saving to a path whose handle is still open fails on
+    platforms that lock open files. The file carries an ``.mp3`` suffix since
+    edge-tts emits MP3 data by default; a ``.wav`` name would mislabel it.
+    The file is created with ``delete=False``, so it is not removed
+    automatically.
+    """
+    communicate = edge_tts.Communicate(text)
+    # Reserve a unique path, then release the handle before edge-tts writes to it.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+    await communicate.save(tmp_path)
+    return tmp_path
+def chatbot(audio, history):
+    """Handle a recorded utterance: transcribe it, generate a reply, and extend the chat history.
+
+    Args:
+        audio: File path of the recording, or ``None`` when nothing was recorded.
+        history: List of ``(user_text, bot_reply)`` pairs; ``None`` is treated as empty.
+
+    Returns:
+        ``(None, history)``. The first element is written back to the audio
+        input component; ``None`` clears it, whereas an empty string is not a
+        valid audio value.
+    """
+    if history is None:
+        history = []
+    if audio is None:
+        return None, history
+    text = speech_to_text(audio)
+    response = generate(text, history)
+    history.append((text, response))
+    return None, history
+def text_input(text, history):
+    """Handle a typed message: generate a reply and extend the chat history.
+
+    Args:
+        text: The submitted message. Blank or whitespace-only input is ignored,
+            mirroring the "no audio" guard of the voice handler.
+        history: List of ``(user_text, bot_reply)`` pairs; ``None`` is treated as empty.
+
+    Returns:
+        ``("", history)``; the empty string clears the textbox.
+    """
+    if history is None:
+        history = []
+    # Do not spend a model call on an empty submission.
+    if not text or not text.strip():
+        return "", history
+    response = generate(text, history)
+    history.append((text, response))
+    return "", history
+async def generate_audio(history):
+    """Synthesize speech for the most recent bot reply in *history*.
+
+    Returns the path produced by ``text_to_speech``, or ``None`` when there is
+    nothing to speak: *history* is ``None`` or empty, or the last reply is
+    missing or empty.
+    """
+    if not history:
+        return None
+    last_response = history[-1][1]
+    # Skip synthesis when the last turn has no reply text.
+    if not last_response:
+        return None
+    return await text_to_speech(last_response)
+# Gradio interface setup.
+# The components use their own names (chat_display, message_box) so they do not
+# rebind the module-level chatbot() and text_input() handlers; the event
+# listeners below need those names to still refer to the callables.
+with gr.Blocks() as demo:
+    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")
+    chat_display = gr.Chatbot()
+    # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases expect
+    # `sources=["microphone"]` — confirm against the pinned Gradio version.
+    audio_input = gr.Audio(source="microphone", type="filepath")
+    message_box = gr.Textbox(placeholder="Type your message here...")
+    audio_output = gr.Audio(label="AI Response")
+    # Voice path: a new recording is transcribed and answered.
+    audio_input.change(chatbot, inputs=[audio_input, chat_display], outputs=[audio_input, chat_display])
+    # Text path: a submitted message is answered and the textbox is cleared.
+    message_box.submit(text_input, inputs=[message_box, chat_display], outputs=[message_box, chat_display])
+    # Whenever the chat changes, speak the latest reply.
+    chat_display.change(generate_audio, inputs=[chat_display], outputs=[audio_output])
+if __name__ == "__main__":
+    demo.launch()