#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install gradio
#!pip install ollama
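# Note: ffmpeg is needed at runtime by both whisper and pydub to decode audio files.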
import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation
# Start the Ollama server in the background
os.system("ollama serve &")
# Define the directories where the voice, background-music, and response .mp3 files are located
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"
# Load the Whisper model for speech-to-text
whisper_model = whisper.load_model("base")
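# "base" is a lightweight checkpoint; larger ones ("small", "medium", "large")
# improve transcription accuracy at the cost of speed and memory.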
# Function to list all voice files (.mp3) in the directory
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]
# Function to list all background music files in the directory
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")  # Add "None" as the first option
    return bg_files
# Function to adjust the speed and pitch of the selected voice file and add background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected .mp3 file using librosa
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)
        # Adjust the speed using librosa's time_stretch (rate > 1.0 is faster, < 1.0 is slower)
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)
        # Adjust the pitch using librosa's pitch_shift (n_steps is in semitones)
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)
        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB
            # Ensure the background music is at least as long as the modified voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]
            # Overlay the background music
            final_audio = modified_voice.overlay(bg)
            # Save the final audio with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file
        return temp_file
    return None
# Function to evaluate the original audio and return the appropriate response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 file using Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        transcribed_text = result["text"]
        # Create a prompt to evaluate whether the text is meaningful English
        prompt = f'''
        Text: {transcribed_text}
        Instructions:
        1. Read the text above.
        2. If the text is not in meaningful English, write only 'no'.
        3. If the text is in meaningful English, write only 'yes'.
        4. Do not write anything else except 'no' or 'yes'.
        '''
        # Use Ollama to evaluate the text (dict-style access works across ollama-python versions)
        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}])['message']['content']
        print(response)
        # Determine which response file to play and the appropriate message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in the Pig Latin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        return response_file, message
    return None, ""
# Create the Gradio interface
with gr.Blocks() as demo:
    # Dropdown to list the available voice .mp3 files
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
    # Dropdown to select pitch shift (in semitones)
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
    # Audio component to play the modified voice
    audio_player = gr.Audio(label="Modified Voice")
    # Re-render the audio whenever any of the four controls changes
    adjust_inputs = [available_musics, speed_selector, pitch_selector, bg_music_selector]
    for control in adjust_inputs:
        control.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)
    # New section for the AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message
        available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
    with gr.Group():
        gr.Markdown("### Project Explanation")
gr.Markdown(""" | |
As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including: | |
- **Voice Selection (choose from available voices) | |
- **Speed Modification (adjust speaking speed) | |
- **Pitch Adjustment (alter voice pitch) | |
- **Background Sound Addition (enhance the audio with background effects) | |
- **Handling Pig Latin Language Input | |
The assignment did not explicitly specify how to handle user speech in Pig Latin. Based on my assumption: | |
- **If the user's speech is in Pig Latin, the voice assistant will respond in Pig Latin with: | |
"Thank you, I received your query. I will get back to you soon." | |
- **If the user's speech is in English, the voice assistant will reply in Pig Latin: | |
"Sorry, please speak in Pig Latin. I cannot understand otherwise." | |
Technology Stack | |
The project utilizes the following technologies: | |
Audio Processing: librosa, pydub, soundfile | |
Speech-to-Text Model: Whisper (for transcribing speech) | |
LLM Model: PH3 (for generating AI responses) | |
Optimized for Efficiency: Due to computational constraints, I used lightweight deep learning and NLP models. However, these can be upgraded for improved performance. | |
How to Use the Voice Assistant | |
- **Select a voice from the dropdown menu. | |
- **Adjust the speed of the voice. | |
- **Modify the pitch as desired. | |
- **Add background sound if needed. | |
- **Listen to the modified voice output. | |
- **Receive an AI-generated response from the voice assistant. | |
""") | |
# Launch the Gradio interface
demo.launch(share=True)