#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install gradio
#!pip install ollama

import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation

# Start the Ollama server in the background
os.system("ollama serve &")

# Directories containing the voice files, background music, and response files
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"

# Load the Whisper model for speech-to-text
whisper_model = whisper.load_model("base")


# List all voice (.mp3) files in the directory
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]


# List all background music files in the directory, with "None" as the first option
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")
    return bg_files


# Adjust the speed and pitch of the selected voice file and optionally add background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected file with librosa
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)

        # Adjust the speed using librosa's time_stretch function
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)

        # Adjust the pitch using librosa's pitch_shift function
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)

        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)

        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB

            # Loop the background music so it is at least as long as the modified voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]

            # Overlay the background music on the voice
            final_audio = modified_voice.overlay(bg)

            # Save the final audio with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file

        return temp_file
    return None


# Evaluate the original audio and return the appropriate response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 file with Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        english_text = result["text"]

        # Prompt asking the LLM whether the transcript is meaningful English
        prompt = f'''
Text: {english_text}
Instructions:
1. Read the text above.
2. If the text is not in meaningful English, write only 'no'.
3. If the text is in meaningful English, write only 'yes'.
4. Do not write anything else except 'no' or 'yes'.
'''

        # Use Ollama to evaluate the text
        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content
        print(response)

        # Determine which response file to play and the appropriate message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in the Pig Latin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        return response_file, message
    return None, ""


# Create the Gradio interface
with gr.Blocks() as demo:
    # Dropdown listing the available voice files
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")

    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)

    # Dropdown to select pitch shift
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)

    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")

    # Audio component to play the adjusted voice file
    audio_player = gr.Audio(label="")

    # Re-render the audio whenever any of the dropdowns changes
    adjust_inputs = [available_musics, speed_selector, pitch_selector, bg_music_selector]
    available_musics.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)
    speed_selector.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)
    pitch_selector.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)
    bg_music_selector.change(fn=adjust_audio, inputs=adjust_inputs, outputs=audio_player)

    # Section for the AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message

    available_musics.change(fn=evaluate_audio, inputs=available_musics,
                            outputs=[ai_response_audio, ai_response_message])

    with gr.Group():
        gr.Markdown("### Project Explanation")
        gr.Markdown("""
As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:

- **Voice Selection** (choose from available voices)
- **Speed Modification** (adjust speaking speed)
- **Pitch Adjustment** (alter voice pitch)
- **Background Sound Addition** (enhance the audio with background effects)

**Handling Pig Latin Language Input**

The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumptions:

- If the user's speech is in Pig Latin, the voice assistant responds in Pig Latin with: "Thank you, I received your query. I will get back to you soon."
- If the user's speech is in English, the voice assistant replies in Pig Latin: "Sorry, please speak in Pig Latin. I cannot understand otherwise."

**Technology Stack**

The project uses the following technologies:

- Audio processing: librosa, pydub, soundfile
- Speech-to-text model: Whisper (for transcribing speech)
- LLM: Phi-3 (for evaluating the transcribed query)
- Optimized for efficiency: due to computational constraints, I used lightweight deep learning and NLP models; these can be upgraded for improved performance.

**How to Use the Voice Assistant**

1. Select a voice from the dropdown menu.
2. Adjust the speed of the voice.
3. Modify the pitch as desired.
4. Add background sound if needed.
5. Listen to the modified voice output.
6. Receive an AI-generated response from the voice assistant.
""")

# Launch the Gradio interface
demo.launch(share=True)
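# ---------------------------------------------------------------------------
# Optional sanity check -- a minimal sketch, not part of the original app.
# It synthesizes a one-second 440 Hz test tone with numpy (already imported),
# writes it into wav_directory, and runs it through adjust_audio() so the
# speed/pitch pipeline can be verified without the UI. The file name
# "temp_tone.wav" is a hypothetical choice; any writable name works. Uncomment
# and run this before demo.launch(), which blocks in a plain script, or in a
# separate notebook cell.
# ---------------------------------------------------------------------------
# sr_test = 22050
# t = np.linspace(0, 1.0, sr_test, endpoint=False)
# tone = 0.2 * np.sin(2 * np.pi * 440.0 * t)  # quiet 440 Hz sine wave
# sf.write(os.path.join(wav_directory, "temp_tone.wav"), tone, sr_test)
# out = adjust_audio("temp_tone.wav", speed=1.5, pitch=2, bg_music="None")
# print("Adjusted audio written to:", out)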