# cosmox_prj/app.py
# Dependencies (uncomment to install in a notebook environment):
#!pip install openai-whisper
#!apt-get install ffmpeg
#!pip install playsound
#!pip install pydub
#!pip install librosa
#!pip install soundfile
#!pip install gradio
#!pip install ollama
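# Note: ffmpeg is required at runtime by both whisper and pydub to decode
# .mp3 files, so the apt-get step matters even outside a notebook.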
import gradio as gr
import os
import librosa
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import whisper  # For speech-to-text
import ollama  # For AI text evaluation
# Start Ollama server in the background
os.system("ollama serve &")
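# Note: "ollama serve &" returns immediately while the server is still
# starting, so the very first chat request can fail on a cold start; a short
# retry or delay before the first call is a reasonable safeguard.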
# Directories containing the voice clips, background music, and canned response files
wav_directory = "./files"
bg_directory = "./files/bg"
response_directory = "./files/response"
# Load Whisper model for speech-to-text
whisper_model = whisper.load_model("base")
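# "base" is one of the smallest Whisper checkpoints; swapping in "small" or
# "medium" generally improves transcription accuracy at the cost of memory
# and load time.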
# List all voice clips (.mp3) in the voice directory
def list_wav_files():
    return [f for f in os.listdir(wav_directory) if f.endswith('.mp3')]
# List all background music files in the directory
def list_bg_files():
    bg_files = [f for f in os.listdir(bg_directory) if f.endswith('.mp3')]
    bg_files.insert(0, "None")  # Add "None" as the first option
    return bg_files
# Adjust the speed and pitch of the selected clip and optionally mix in background music
def adjust_audio(selected_file, speed, pitch, bg_music):
    if selected_file:
        # Load the selected clip with librosa (sr=None keeps the original sample rate)
        file_path = os.path.join(wav_directory, selected_file)
        y, sr = librosa.load(file_path, sr=None)
        # Adjust the speed using librosa's time_stretch (rate > 1 speeds up)
        if speed != 1.0:
            y = librosa.effects.time_stretch(y=y, rate=speed)
        # Shift the pitch by n_steps semitones using librosa's pitch_shift
        if pitch != 0:
            y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch)
        # Save the adjusted audio to a temporary file
        temp_file = os.path.join(wav_directory, "temp_adjusted.wav")
        sf.write(temp_file, y, sr)
        # If background music is selected and not "None", overlay it
        if bg_music and bg_music != "None":
            bg_path = os.path.join(bg_directory, bg_music)
            modified_voice = AudioSegment.from_file(temp_file)
            bg = AudioSegment.from_file(bg_path)
            bg = bg - 20  # Reduce background music volume by 20 dB
            # Loop the background music so it is at least as long as the voice
            if len(bg) < len(modified_voice):
                bg = bg * (len(modified_voice) // len(bg) + 1)
            bg = bg[:len(modified_voice)]
            # Overlay the background music under the voice
            final_audio = modified_voice.overlay(bg)
            # Save the final mix with background music
            final_file = os.path.join(wav_directory, "temp_final.wav")
            final_audio.export(final_file, format="wav")
            return final_file
        return temp_file
    return None
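# Example (hypothetical file names): speed a clip up by 25%, raise its pitch
# two semitones, and mix in rain ambience:
#   adjust_audio("voice1.mp3", speed=1.25, pitch=2, bg_music="rain.mp3")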
# Transcribe the original clip, classify the language, and return the matching response file and message
def evaluate_audio(selected_file):
    if selected_file:
        # Transcribe the selected .mp3 clip using Whisper
        file_path = os.path.join(wav_directory, selected_file)
        result = whisper_model.transcribe(file_path)
        english_text = result["text"]
        # Prompt asking the LLM whether the transcript is meaningful English
        prompt = f'''
Text: {english_text}
Instructions:
1. Read the text above.
2. If the text is not in meaningful English, write only 'no'.
3. If the text is in meaningful English, write only 'yes'.
4. Do not write anything else except 'no' or 'yes'.
'''
        # Use Ollama (phi3) to evaluate the text
        response = ollama.chat(model='phi3', messages=[{'role': 'user', 'content': prompt}]).message.content
        print(response)
        # Pick the response file to play and the matching message
        if "yes" in response.lower():
            message = "Your inquiry is in the English language."
            response_file = os.path.join(response_directory, "res.mp3")
        else:
            message = "Your inquiry is in the Pig Latin language."
            response_file = os.path.join(response_directory, "nres.mp3")
        return response_file, message
    return None, ""
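# Note: the attribute access above (.message.content) assumes ollama-python
# >= 0.4, where chat() returns a typed ChatResponse; older releases return a
# plain dict (response['message']['content']).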
# Build the Gradio interface
with gr.Blocks() as demo:
    # Dropdown listing the available voice clips
    available_musics = gr.Dropdown(choices=list_wav_files(), label="Available Voices")
    # Dropdown to select playback speed
    speed_options = [1, 1.25, 1.5, 1.75, 2]
    speed_selector = gr.Dropdown(choices=speed_options, label="Select Playback Speed", value=1)
    # Dropdown to select pitch shift (in semitones)
    pitch_options = [0, 1, 2, 3, 4, 5]
    pitch_selector = gr.Dropdown(choices=pitch_options, label="Select Pitch Shift", value=0)
    # Dropdown to select background music
    bg_music_selector = gr.Dropdown(choices=list_bg_files(), label="Select Background Sound")
    # Audio component that plays the adjusted clip
    audio_player = gr.Audio(label="")
    # Re-render the adjusted audio whenever any control changes
    for control in (available_musics, speed_selector, pitch_selector, bg_music_selector):
        control.change(fn=adjust_audio, inputs=[available_musics, speed_selector, pitch_selector, bg_music_selector], outputs=audio_player)
    # Section for the AI-generated response
    with gr.Group():
        gr.Markdown("### AI Generated Response By Voice Agent")
        ai_response_audio = gr.Audio(label="AI Response")
        ai_response_message = gr.Markdown("")  # Placeholder for the message
        available_musics.change(fn=evaluate_audio, inputs=available_musics, outputs=[ai_response_audio, ai_response_message])
    with gr.Group():
        gr.Markdown("### Project Explanation")
        gr.Markdown("""
As per the assignment requirements, I have developed a web interface that allows users to customize voice settings, including:
- **Voice Selection** (choose from available voices)
- **Speed Modification** (adjust speaking speed)
- **Pitch Adjustment** (alter voice pitch)
- **Background Sound Addition** (enhance the audio with background effects)
- **Handling Pig Latin Language Input**

The assignment did not explicitly specify how to handle user speech in Pig Latin, so I made the following assumption:
- **If the user's speech is in Pig Latin**, the voice assistant responds in Pig Latin with:
  "Thank you, I received your query. I will get back to you soon."
- **If the user's speech is in English**, the voice assistant replies in Pig Latin:
  "Sorry, please speak in Pig Latin. I cannot understand otherwise."

**Technology Stack**
- Audio Processing: librosa, pydub, soundfile
- Speech-to-Text Model: Whisper (for transcribing speech)
- LLM Model: Phi-3 (for generating AI responses)
- Optimized for Efficiency: due to computational constraints, I used lightweight deep-learning and NLP models; these can be upgraded for improved performance.

**How to Use the Voice Assistant**
1. Select a voice from the dropdown menu.
2. Adjust the speed of the voice.
3. Modify the pitch as desired.
4. Add background sound if needed.
5. Listen to the modified voice output.
6. Receive an AI-generated response from the voice assistant.
""")
# Launch the Gradio interface
demo.launch(share=True)
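# Note: share=True creates a temporary public gradio.live link when run
# locally; on Hugging Face Spaces the app is already served publicly and the
# flag is ignored.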