# cosmox_prj/app.py
# Dependencies (uncomment to install in a notebook environment):
#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install soundfile
#!pip install gradio
#!pip install ollama
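# Note: ffmpeg is required at runtime by both whisper and pydub to decode
# .mp3 files, so the apt-get step matters even outside a notebook.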
import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation
# Start Ollama server in the background
os.system("ollama serve &")
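# Note: "ollama serve &" returns immediately while the server is still
# starting, so the very first chat request can fail on a cold start; a short
# retry or delay before the first call is a reasonable safeguard.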
# Directories containing the voice clips, background music, and canned response files
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"
# Load Whisper model for speech-to-text
whisper_model = whisper.load_model("base")
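# "base" is one of the smallest Whisper checkpoints; swapping in "small" or
# "medium" generally improves transcription accuracy at the cost of memory
# and load time.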
# List all voice clips (.mp3) in the voice directory
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]
# List all background music files in the directory
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")  # Add "None" as the first option
    return bg_files
# Adjust the speed and pitch of the selected clip and optionally mix in background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected clip with librosa (sr=None keeps the original sample rate)
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)
        # Adjust the speed using librosa's time_stretch (rate > 1 speeds up)
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)
        # Shift the pitch by n_steps semitones using librosa's pitch_shift
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)
        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB
            # Loop the background music so it is at least as long as the voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]
            # Overlay the background music under the voice
            final_audio = modified_voice.overlay(bg)
            # Save the final mix with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file
        return temp_file
    return None
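# Example (hypothetical file names): speed a clip up by 25%, raise its pitch
# two semitones, and mix in rain ambience:
#   adjust_audio("voice1.mp3", speed=1.25, pitch=2, bg_music="rain.mp3")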
# Transcribe the original clip, classify the language, and return the matching response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 clip using Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        english_text = result["text"]
        # Prompt asking the LLM whether the transcript is meaningful English
        prompt = f'''
Text: {english_text}
Instructions:
1. Read the text above.
2. If the text is not in meaningful English, write only 'no'.
3. If the text is in meaningful English, write only 'yes'.
4. Do not write anything else except 'no' or 'yes'.
'''
        # Use Ollama (phi3) to evaluate the text
        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content
        print(response)
        # Pick the response file to play and the matching message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in the Pig Latin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        return response_file, message
    return None, ""
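# Note: the attribute access above (.message.content) assumes ollama-python
# >= 0.4, where chat() returns a typed ChatResponse; older releases return a
# plain dict (response['message']['content']).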
# Build the Gradio interface
with gr.Blocks() as demo:
    # Dropdown listing the available voice clips
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
    # Dropdown to select pitch shift (in semitones)
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
    # Audio component that plays the adjusted clip
    audio_player = gr.Audio(label="")
    # Re-render the adjusted audio whenever any control changes
    for control in (available_musics, speed_selector, pitch_selector, bg_music_selector):
        control.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    # Section for the AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message
        available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
    with gr.Group():
        gr.Markdown("### Project Explanation")
        gr.Markdown("""
As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:
- **Voice Selection** (choose from available voices)
- **Speed Modification** (adjust speaking speed)
- **Pitch Adjustment** (alter voice pitch)
- **Background Sound Addition** (enhance the audio with background effects)
- **Handling Pig Latin Language Input**

The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumption:
- **If the user's speech is in Pig Latin**, the voice assistant responds in Pig Latin with:
  "Thank you, I received your query. I will get back to you soon."
- **If the user's speech is in English**, the voice assistant replies in Pig Latin:
  "Sorry, please speak in Pig Latin. I cannot understand otherwise."

**Technology Stack**
- Audio Processing: librosa, pydub, soundfile
- Speech-to-Text Model: Whisper (for transcribing speech)
- LLM Model: Phi-3 (for generating AI responses)
- Optimized for Efficiency: due to computational constraints, I used lightweight deep-learning and NLP models; these can be upgraded for improved performance.

**How to Use the Voice Assistant**
1. Select a voice from the dropdown menu.
2. Adjust the speed of the voice.
3. Modify the pitch as desired.
4. Add background sound if needed.
5. Listen to the modified voice output.
6. Receive an AI-generated response from the voice assistant.
""")
# Launch the Gradio interface
demo.launch(share=True)
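# Note: share=True creates a temporary public gradio.live link when run
# locally; on Hugging Face Spaces the app is already served publicly and the
# flag is ignored.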