import gradio as gr
import asyncio
import edge_tts
import os
from huggingface_hub import InferenceClient
import whisper
import torch
import tempfile

# Get the Hugging Face token from the environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")
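
# The token is read from the environment, so it has to be set before launch.
# On a Hugging Face Space this is typically done by adding HF_TOKEN as a
# repository secret; for a local run, something like the following
# (illustrative, assumes the file is saved as app.py) works:
#
#   export HF_TOKEN=hf_your_token_here
#   python app.py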

# Initialize the Hugging Face Inference Client
client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)

# Load the Whisper model
whisper_model = whisper.load_model("tiny.en", device='cuda' if torch.cuda.is_available() else 'cpu')

# Initialize an empty chat history
chat_history = []

async def text_to_speech_stream(text):
    """Convert text to speech using edge_tts and return the audio file path."""
    communicate = edge_tts.Communicate(text, "en-US-AvaMultilingualNeural")
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # Write the collected audio to a temporary MP3 file and return its path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_data)
    return temp_file.name
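
# Quick sanity check (illustrative, not run by the app): synthesize a short
# phrase and print the path of the generated MP3.
#
#   path = asyncio.run(text_to_speech_stream("Hello from the voice assistant."))
#   print(path)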

def whisper_speech_to_text(audio):
    """Convert speech to text using the Whisper model."""
    try:
        result = whisper_model.transcribe(audio)
        return result['text']
    except Exception as e:
        print(f"Whisper Error: {e}")
        return None
    finally:
        # Free GPU memory between transcriptions
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
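
# Illustrative transcription call (assumes a local recording exists at the
# hypothetical path "sample.wav"):
#
#   text = whisper_speech_to_text("sample.wav")
#   print(text)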

async def chat_with_ai(message):
    """Send the user message to the LLM, track history, and synthesize the reply."""
    global chat_history
    chat_history.append({"role": "user", "content": message})
    try:
        response = client.chat_completion(
            messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
            max_tokens=800,
            temperature=0.7
        )
        response_text = response.choices[0].message['content']
        chat_history.append({"role": "assistant", "content": response_text})
        audio_path = await text_to_speech_stream(response_text)
        return response_text, audio_path
    except Exception as e:
        print(f"Error: {e}")
        return str(e), None
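
# Illustrative call (not executed by the app): ask a question and get back the
# reply text plus a path to the synthesized speech.
#
#   reply, mp3_path = asyncio.run(chat_with_ai("What can you help me with?"))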

def transcribe_and_chat(audio):
    """Transcribe the recorded audio, then route the text through chat_with_ai."""
    text = whisper_speech_to_text(audio)
    if text is None:
        return "Sorry, I couldn't understand the audio.", None
    response, audio_path = asyncio.run(chat_with_ai(text))
    return response, audio_path

def create_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# AI Voice Assistant")
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(type="filepath", label="Press 'Record' to Speak")
            with gr.Column(scale=1):
                chat_output = gr.Textbox(label="AI Response")
                audio_output = gr.Audio(label="AI Voice Response", autoplay=True)

        def process_audio(audio):
            response, audio_path = transcribe_and_chat(audio)
            return response, audio_path, None  # Return None to clear the audio input

        # Client-side JS: auto-submit after recording stops and auto-play the reply
        demo.load(None, js="""
        function() {
            document.querySelector("audio").addEventListener("stop", function() {
                setTimeout(function() {
                    document.querySelector('button[title="Submit"]').click();
                }, 500);
            });

            function playAssistantAudio() {
                var audioElements = document.querySelectorAll('audio');
                if (audioElements.length > 1) {
                    var assistantAudio = audioElements[1];
                    if (assistantAudio) {
                        assistantAudio.play();
                    }
                }
            }

            document.addEventListener('gradioAudioLoaded', function(event) {
                playAssistantAudio();
            });

            document.addEventListener('gradioUpdated', function(event) {
                setTimeout(playAssistantAudio, 100);
            });
        }
        """)

        audio_input.change(process_audio, inputs=[audio_input], outputs=[chat_output, audio_output, audio_input])

    return demo

# Launch the Gradio app
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)