#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install gradio
#!pip install ollama
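# Note: ffmpeg must be available on the system PATH; both Whisper and pydub rely on it
# to decode the .mp3 files used below.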
import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation


# Start Ollama server in the background
os.system("ollama serve &")

# Define the directories where your voice (.mp3), background music, and response files are located
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"

# Load Whisper model for speech-to-text
whisper_model = whisper.load_model("base")
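# "base" is a lightweight Whisper checkpoint that is downloaded on first use; larger
# checkpoints such as "small" or "medium" can be swapped in for better transcription
# accuracy at the cost of speed and memory.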

# Function to list all voice .mp3 files in the directory (despite the name, the app works with .mp3 files)
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]

# Function to list all background music files in the directory
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")  # Add "None" as the first option
    return bg_files

# Function to adjust the speed and pitch of the selected audio file and add background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected audio file using librosa
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)
        
        # Adjust the speed using librosa's time_stretch function
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)
        
        # Adjust the pitch using librosa's pitch_shift function
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
        
        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)
        
        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB
            
            # Ensure the background music is at least as long as the modified voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]
            
            # Overlay the background music
            final_audio = modified_voice.overlay(bg)
            
            # Save the final audio with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file
        
        return temp_file
    return None
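
# Usage sketch (hypothetical file names): adjust_audio("greeting.mp3", 1.25, 2, "rain.mp3")
# would return the path to a temporary .wav played 25% faster, shifted up two semitones,
# with "rain.mp3" mixed in 20 dB quieter than the voice.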

# Function to evaluate the original audio and return the appropriate response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 file using Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        english_text = result["text"]

        # Create a prompt to evaluate if the text is meaningful English
        prompt = f'''
        Text: {english_text}
        
        Instructions:
        1. Read the text above.
        2. If the text is not in meaningful English, write only 'no'.
        3. If the text is in meaningful English, write only 'yes'.
        4. Do not write anything else except 'no' or 'yes'.
        '''
        
        # Use Ollama to evaluate the text
        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content

        print(response)
        
        # Determine which response file to play and the appropriate message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in Piglatin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        
        return response_file, message
    return None, ""
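
# Usage sketch (hypothetical file name): evaluate_audio("greeting.mp3") transcribes the clip
# with Whisper, asks phi3 whether the transcript is meaningful English, and returns the
# matching response clip ("res.mp3" or "nres.mp3") together with a short status message.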

# Create the Gradio interface
with gr.Blocks() as demo:
    # Dropdown to list available voice files (.mp3)
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
    
    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
    
    # Dropdown to select pitch shift
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
    
    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
    
    # Audio component to play the adjusted audio output
    audio_player = gr.Audio(label="")
    
    # Link the dropdowns to the audio player
    available_musics.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    speed_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    pitch_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    bg_music_selector.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    
    # New section for AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message
        
        available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])

    with gr.Group():
        gr.Markdown("### Project Explanation")
        gr.Markdown("""
        As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:

        - **Voice Selection** (choose from available voices)
        - **Speed Modification** (adjust speaking speed)
        - **Pitch Adjustment** (alter voice pitch)
        - **Background Sound Addition** (enhance the audio with background effects)
        - **Handling Pig Latin Language Input**

        The assignment did not explicitly specify how to handle user speech in Pig Latin. Based on my interpretation:

        - If the user's speech is in Pig Latin, the voice assistant responds in Pig Latin with:
          "Thank you, I received your query. I will get back to you soon."
        - If the user's speech is in English, the voice assistant replies in Pig Latin:
          "Sorry, please speak in Pig Latin. I cannot understand otherwise."

        #### Technology Stack

        - **Audio Processing:** librosa, pydub, soundfile
        - **Speech-to-Text Model:** Whisper (for transcribing speech)
        - **LLM Model:** Phi-3 (for generating AI responses)
        - **Optimized for Efficiency:** Due to computational constraints, I used lightweight deep learning and NLP models; these can be upgraded for improved performance.

        #### How to Use the Voice Assistant

        1. Select a voice from the dropdown menu.
        2. Adjust the speed of the voice.
        3. Modify the pitch as desired.
        4. Add background sound if needed.
        5. Listen to the modified voice output.
        6. Receive an AI-generated response from the voice assistant.
        """)

# Launch the Gradio interface
demo.launch(share=True)
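# share=True also creates a temporary public Gradio link; set it to False to keep the app local-only.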