rame committed on
Commit d93856b · verified · 1 Parent(s): e2cfa48

Create app.py

Files changed (1)
  1. app.py +179 -0
app.py ADDED
@@ -0,0 +1,179 @@
+ #!pip install openai-whisper
+ #!apt-get install ffmpeg
+ #!pip install playsound
+ #!pip install pydub
+ #!pip install librosa
+ #!pip install soundfile
+ #!pip install gradio
+ #!pip install ollama
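+ # Note: pydub relies on the ffmpeg binary being on PATH, and the Ollama call
+ # below assumes a local Ollama server with the phi3 model already pulled
+ # (`ollama pull phi3`).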
8
+ import gradio as gr
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import numpy as np
13
+ from pydub import AudioSegment
14
+ import whisper # For speech-to-text
15
+ import ollama # For AI text evaluation
16
+
17
+ # Define the directories where your .wav, background music, and response files are located
18
+ wav_directory = "./files"
19
+ bg_directory = "./files/bg"
20
+ response_directory = "./files/response"
21
+
22
+ # Load Whisper model for speech-to-text
23
+ whisper_model = whisper.load_model("base")
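+ # ("base" trades accuracy for speed; larger Whisper checkpoints such as
+ # "small" or "medium" can be substituted for better transcriptions)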
+
+ # Function to list all voice .mp3 files in the directory
+ def list_wav_files():
+     return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]
+
+ # Function to list all background music files in the directory
+ def list_bg_files():
+     bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
+     bg_files.insert(0, "None")  # Add "None" as the first option
+     return bg_files
+
+ # Function to adjust the speed and pitch of the selected voice file and add background music
+ def adjust_audio(selected_file, speed, pitch, bg_music):
+     if selected_file:
+         # Load the selected file using librosa
+         file_path = os.path.join(wav_directory, selected_file)
+         y, sr = librosa.load(file_path, sr=None)
+
+         # Adjust the speed using librosa's time_stretch function
+         if speed != 1.0:
+             y = librosa.effects.time_stretch(y=y, rate=speed)
+
+         # Adjust the pitch using librosa's pitch_shift function
+         if pitch != 0:
+             y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
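+             # (a rate above 1.0 plays faster; n_steps shifts the pitch in semitones)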
+
+         # Save the adjusted audio to a temporary file
+         temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
+         sf.write(temp_file, y, sr)
+
+         # If background music is selected and not "None", overlay it
+         if bg_music and bg_music != "None":
+             bg_path = os.path.join(bg_directory, bg_music)
+             modified_voice = AudioSegment.from_file(temp_file)
+             bg = AudioSegment.from_file(bg_path)
+             bg = bg - 20  # Reduce background music volume by 20 dB
+
+             # Ensure the background music is at least as long as the modified voice
+             if len(bg) < len(modified_voice):
+                 bg = bg * (len(modified_voice) // len(bg) + 1)
+             bg = bg[:len(modified_voice)]
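+             # (multiplying an AudioSegment repeats it end-to-end, so the track
+             # is looped past the voice length and then trimmed to match)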
+
+             # Overlay the background music
+             final_audio = modified_voice.overlay(bg)
+
+             # Save the final audio with background music
+             final_file = os.path.join(wav_directory, "temp_final.wav")
+             final_audio.export(final_file, format="wav")
+             return final_file
+
+         return temp_file
+     return None
+
+ # Function to evaluate the original audio and return the appropriate response file and message
+ def evaluate_audio(selected_file):
+     if selected_file:
+         # Transcribe the selected .mp3 file using Whisper
+         file_path = os.path.join(wav_directory, selected_file)
+         result = whisper_model.transcribe(file_path)
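+         # transcribe() decodes the audio through ffmpeg and returns a dict
+         # whose "text" field holds the full transcription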
+         english_text = result["text"]
+
+         # Create a prompt to evaluate if the text is meaningful English
+         prompt = f'''
+         Text: {english_text}
+
+         Instructions:
+         1. Read the text above.
+         2. If the text is not in meaningful English, write only 'no'.
+         3. If the text is in meaningful English, write only 'yes'.
+         4. Do not write anything else except 'no' or 'yes'.
+         '''
+
+         # Use Ollama to evaluate the text
+         response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content
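+         # (attribute access on the chat response requires ollama-python >= 0.4;
+         # older releases return a dict: response['message']['content'])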
+
+         print(response)
+
+         # Determine which response file to play and the appropriate message
+         if "yes" in response.lower():
+             message = "Your inquiry is in the English language."
+             response_file = os.path.join(response_directory, "res.mp3")
+         else:
+             message = "Your inquiry is in the Pig Latin language."
+             response_file = os.path.join(response_directory, "nres.mp3")
+
+         return response_file, message
+     return None, ""
+
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     # Dropdown to list the available voice files
+     available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
+
+     # Dropdown to select playback speed
+     speed_options = [1, 1.25, 1.5, 1.75, 2]
+     speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
+
+     # Dropdown to select pitch shift
+     pitch_options = [0, 1, 2, 3, 4, 5]
+     pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
+
+     # Dropdown to select background music
+     bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
+
+     # Audio component to play the adjusted voice file
+     audio_player = gr.Audio(label="")
+
+     # Link the dropdowns to the audio player
+     available_musics.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+     speed_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+     pitch_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+     bg_music_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
+
+     # New section for the AI-generated response
+     with gr.Group():
+         gr.Markdown("### AI Generated Response By Voice Agent")
+         ai_response_audio = gr.Audio(label="AI Response")
+         ai_response_message = gr.Markdown("")  # Placeholder for the message
+
+     available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
+
+     with gr.Group():
+         gr.Markdown("### Project Explanation")
+         gr.Markdown("""
+ As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:
+
+ - **Voice Selection** (choose from available voices)
+ - **Speed Modification** (adjust speaking speed)
+ - **Pitch Adjustment** (alter voice pitch)
+ - **Background Sound Addition** (enhance the audio with background effects)
+
+ **Handling Pig Latin Language Input**
+
+ The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumption:
+
+ - If the user's speech is in Pig Latin, the voice assistant responds in Pig Latin with:
+   "Thank you, I received your query. I will get back to you soon."
+ - If the user's speech is in English, the voice assistant replies in Pig Latin:
+   "Sorry, please speak in Pig Latin. I cannot understand otherwise."
+
+ **Technology Stack**
+
+ The project utilizes the following technologies:
+
+ - Audio Processing: librosa, pydub, soundfile
+ - Speech-to-Text Model: Whisper (for transcribing speech)
+ - LLM Model: Phi-3 via Ollama (for evaluating the transcribed text)
+ - Optimized for Efficiency: due to computational constraints, I used lightweight deep learning and NLP models; these can be upgraded for improved performance.
+
+ **How to Use the Voice Assistant**
+
+ 1. Select a voice from the dropdown menu.
+ 2. Adjust the speed of the voice.
+ 3. Modify the pitch as desired.
+ 4. Add background sound if needed.
+ 5. Listen to the modified voice output.
+ 6. Receive an AI-generated response from the voice assistant.
+
+         """)
+
+ # Launch the Gradio interface
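+ # (share=True exposes the app through a temporary public gradio.live link
+ # in addition to the local server)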
+ demo.launch(share=True)