Create app.py
app.py ADDED
@@ -0,0 +1,179 @@
+#!pip install openai-whisper
+#!apt-get install ffmpeg
+#!pip install playsound
+#!pip install pydub
+#!pip install librosa
+#!pip install gradio
+#!pip install ollama
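+# Note: ffmpeg must be available on PATH -- both Whisper and pydub shell out
+# to it to decode .mp3 files, which is why it is installed above.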
+import gradio as gr
+import os
+import librosa
+import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
+import whisper  # For speech-to-text
+import ollama  # For AI text evaluation
+
+# Directories where the voice clips (.mp3), background music, and canned response files are located
+wav_directory = "./files"
+bg_directory = "./files/bg"
+response_directory = "./files/response"
+
+# Load the Whisper model for speech-to-text
+whisper_model = whisper.load_model("base")
+
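+# Note: load_model("base") downloads the checkpoint on first use. Larger
+# checkpoints trade speed for accuracy, e.g.:
+# whisper_model = whisper.load_model("small")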
+# List all voice files in the directory (the app uses .mp3 files despite the function name)
+def list_wav_files():
+    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]
+
+# List all background music files in the directory
+def list_bg_files():
+    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
+    bg_files.insert(0, "None")  # Add "None" as the first option
+    return bg_files
+
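+# Note: both lists are read once when the interface below is built, so files
+# added to these directories afterwards require an app restart to appear.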
+# Adjust the speed and pitch of the selected file and optionally mix in background music
+def adjust_audio(selected_file, speed, pitch, bg_music):
+    if selected_file:
+        # Load the selected file with librosa
+        file_path = os.path.join(wav_directory, selected_file)
+        y, sr = librosa.load(file_path, sr=None)
+
+        # Adjust the speed with librosa's time_stretch function
+        if speed != 1.0:
+            y = librosa.effects.time_stretch(y=y, rate=speed)
+
+        # Adjust the pitch with librosa's pitch_shift function
+        if pitch != 0:
+            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
+
+        # Save the adjusted audio to a temporary file
+        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
+        sf.write(temp_file, y, sr)
+
+        # If background music is selected and not "None", overlay it
+        if bg_music and bg_music != "None":
+            bg_path = os.path.join(bg_directory, bg_music)
+            modified_voice = AudioSegment.from_file(temp_file)
+            bg = AudioSegment.from_file(bg_path)
+            bg = bg - 20  # Reduce background music volume by 20 dB
+
+            # Loop the background music until it is at least as long as the voice
+            if len(bg) < len(modified_voice):
+                bg = bg * (len(modified_voice) // len(bg) + 1)
+            bg = bg[:len(modified_voice)]
+
+            # Overlay the background music on the voice
+            final_audio = modified_voice.overlay(bg)
+
+            # Save the final mix with background music
+            final_file = os.path.join(wav_directory, "temp_final.wav")
+            final_audio.export(final_file, format="wav")
+            return final_file
+
+        return temp_file
+    return None
+
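+# Note: adjust_audio writes to fixed temp filenames, so concurrent users would
+# overwrite each other's output; per-session names (e.g. via tempfile) would be
+# needed for a multi-user deployment.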
+# Evaluate the original audio and return the matching response file and message
+def evaluate_audio(selected_file):
+    if selected_file:
+        # Transcribe the selected .mp3 file using Whisper
+        file_path = os.path.join(wav_directory, selected_file)
+        result = whisper_model.transcribe(file_path)
+        english_text = result["text"]
+
+        # Prompt the LLM to judge whether the transcript is meaningful English
+        prompt = f'''
+        Text: {english_text}
+
+        Instructions:
+        1. Read the text above.
+        2. If the text is not in meaningful English, write only 'no'.
+        3. If the text is in meaningful English, write only 'yes'.
+        4. Do not write anything else except 'no' or 'yes'.
+        '''
+
+        # Use Ollama to evaluate the text
+        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content
+
+        print(response)
+
+        # Pick the response file to play and the matching message
+        if "yes" in response.lower():
+            message = "Your inquiry is in the English language."
+            response_file = os.path.join(response_directory, "res.mp3")
+        else:
+            message = "Your inquiry is in the Pig Latin language."
+            response_file = os.path.join(response_directory, "nres.mp3")
+
+        return response_file, message
+    return None, ""
+
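+# Note: `.message.content` matches the object-style response of recent
+# ollama-python releases; older releases return a plain dict, in which case
+# response['message']['content'] is the equivalent access.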
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    # Dropdown listing the available voice files
+    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
+
+    # Dropdown to select playback speed
+    speed_options = [1, 1.25, 1.5, 1.75, 2]
+    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
+
+    # Dropdown to select pitch shift (in semitones)
+    pitch_options = [0, 1, 2, 3, 4, 5]
+    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
+
+    # Dropdown to select background music
+    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
+
+    # Audio component that plays the adjusted file
+    audio_player = gr.Audio(label="")
+
+    # Re-render the audio whenever any dropdown changes
+    available_musics.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+    speed_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+    pitch_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+    bg_music_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+
+    # Section for the AI-generated response
+    with gr.Group():
+        gr.Markdown("### AI Generated Response By Voice Agent")
+        ai_response_audio = gr.Audio(label="AI Response")
+        ai_response_message = gr.Markdown("")  # Placeholder for the message
+
+        available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
+
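+    # Note: selecting a voice fires both adjust_audio and evaluate_audio, and
+    # the latter runs Whisper plus an Ollama round-trip, so the AI response can
+    # lag the audio player by a few seconds.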
+    with gr.Group():
+        gr.Markdown("### Project Explanation")
+        gr.Markdown("""
+As per the assignment requirements, I have developed a web interface that lets users customize voice settings, including:
+
+- **Voice Selection** (choose from available voices)
+- **Speed Modification** (adjust speaking speed)
+- **Pitch Adjustment** (alter voice pitch)
+- **Background Sound Addition** (enhance the audio with background effects)
+- **Pig Latin Input Handling**
+
+The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumption:
+
+- If the user's speech is in Pig Latin, the voice assistant responds in Pig Latin with:
+  "Thank you, I received your query. I will get back to you soon."
+- If the user's speech is in English, the voice assistant replies in Pig Latin with:
+  "Sorry, please speak in Pig Latin. I cannot understand otherwise."
+
+#### Technology Stack
+
+- Audio processing: librosa, pydub, soundfile
+- Speech-to-text: Whisper (for transcribing speech)
+- LLM: Phi-3 via Ollama (for evaluating the transcript)
+
+Due to computational constraints, I used lightweight deep learning and NLP models; these can be upgraded for improved performance.
+
+#### How to Use the Voice Assistant
+
+1. Select a voice from the dropdown menu.
+2. Adjust the speed of the voice.
+3. Modify the pitch as desired.
+4. Add background sound if needed.
+5. Listen to the modified voice output.
+6. Receive an AI-generated response from the voice assistant.
+""")
+
+# Launch the Gradio interface
+demo.launch(share=True)