# RealTime / app.py
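# Real-time audio-to-audio pipeline: listen on the microphone for a wake word,
# transcribe speech with Whisper, generate a reply with a text-to-text model,
# synthesize the reply with TTS, and expose the loop through a Gradio interface.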
import torch
import torchaudio
import gradio as gr
import pyaudio
import wave
import numpy as np
import spaces  # required for the @spaces.GPU decorators on Hugging Face Spaces
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
# NOTE: OpenVoiceV2Processor / OpenVoiceV2 are not part of the standard transformers API;
# this import assumes a custom build that exposes them (otherwise use the official OpenVoice package).
from transformers import OpenVoiceV2Processor, OpenVoiceV2
# Load ASR model and processor
processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Load text-to-text model and tokenizer (Llama 3 is a causal LM, not a seq2seq model)
text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Load TTS model (see the note on the OpenVoiceV2 import above)
tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")
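# NOTE: meta-llama/Meta-Llama-3-8B is a gated checkpoint; downloading it requires accepting
# the license on Hugging Face and authenticating (e.g. an HF_TOKEN secret on the Space).
# On Spaces ZeroGPU the @spaces.GPU decorators allocate a GPU per call; moving the models
# to CUDA inside the decorated functions (e.g. model.to("cuda")) may also be needed.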
# ASR function
@spaces.GPU()
def transcribe(audio):
    waveform, sample_rate = torchaudio.load(audio)
    # Whisper expects 16 kHz input
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    # Use a single (mono) channel as a 1-D array, as expected by the processor
    inputs = processor_asr(waveform[0].numpy(), sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        # Whisper is a seq2seq model, so decoding uses generate(), not CTC argmax over logits
        predicted_ids = model_asr.generate(inputs.input_features)
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
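# Whisper's feature extractor pads/truncates each clip to a 30-second window, so the short
# 2-second chunks recorded below are handled without extra preprocessing.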
# Text-to-text function
@spaces.GPU(duration=300)
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = text_model.generate(**inputs, max_new_tokens=256)
    # Strip the prompt tokens so only the generated continuation is returned
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
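# NOTE: Meta-Llama-3-8B is the base (non-instruct) checkpoint, so generate() continues the
# prompt rather than answering it; the -Instruct variant with its chat template is the usual
# choice for conversational replies.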
# TTS function
@spaces.GPU(duration=300)
def synthesize_speech(text):
    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        # NOTE: the calls below assume the custom OpenVoiceV2 wrapper exposes a
        # Tacotron-style interface; the official OpenVoice package uses its own API.
        mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
        audio = tts_model.infer(mel_outputs_postnet)
    return audio
# Real-time processing function
@spaces.GPU(duration=300)
def real_time_pipeline():
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)

    wake_word = "hello mate"
    wake_word_detected = False

    print("Listening for wake word...")

    try:
        while True:
            # Record ~2 seconds of audio from the microphone
            frames = []
            for _ in range(0, int(16000 / 1024 * 2)):
                data = stream.read(1024)
                frames.append(data)
            audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)

            # Save the audio to a temporary file for ASR
            wf = wave.open("temp.wav", 'wb')
            wf.setnchannels(1)
            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wf.setframerate(16000)
            wf.writeframes(b''.join(frames))
            wf.close()

            # Step 1: Transcribe audio to text and check for the wake word
            transcription = transcribe("temp.wav").lower()

            if wake_word in transcription:
                wake_word_detected = True
                print("Wake word detected. Processing audio...")

                while wake_word_detected:
                    # Record the next ~2 seconds of audio
                    frames = []
                    for _ in range(0, int(16000 / 1024 * 2)):
                        data = stream.read(1024)
                        frames.append(data)
                    audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)

                    # Save the audio to a temporary file for ASR
                    wf = wave.open("temp.wav", 'wb')
                    wf.setnchannels(1)
                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                    wf.setframerate(16000)
                    wf.writeframes(b''.join(frames))
                    wf.close()

                    # Step 1: Transcribe audio to text
                    transcription = transcribe("temp.wav")

                    # Step 2: Generate response using text-to-text model
                    response = generate_response(transcription)

                    # Step 3: Synthesize speech from text
                    synthesized_audio = synthesize_speech(response)

                    # Save the synthesized audio to a temporary file
                    # (assumes a 22.05 kHz waveform tensor shaped [channels, 1, samples])
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

                    # Play the synthesized audio
                    wf = wave.open(output_path, 'rb')
                    stream_out = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                                        channels=wf.getnchannels(),
                                        rate=wf.getframerate(),
                                        output=True)
                    data = wf.readframes(1024)
                    while data:
                        stream_out.write(data)
                        data = wf.readframes(1024)
                    stream_out.stop_stream()
                    stream_out.close()
                    wf.close()
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
# Gradio interface
gr_interface = gr.Interface(
    fn=real_time_pipeline,
    inputs=None,
    outputs=None,
    live=True,
    title="Real-Time Audio-to-Audio Model",
    description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
)

gr_interface.launch(inline=False)