# RealTime / app.py
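# Real-time audio-to-audio pipeline: listen on the microphone for a wake word,
# transcribe speech with Whisper, generate a reply with a text-to-text model,
# synthesize the reply with TTS, and expose the loop through a Gradio interface.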
import torch
import torchaudio
import gradio as gr
import pyaudio
import wave
import numpy as np
import spaces  # required for the @spaces.GPU decorators on Hugging Face Spaces
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
# NOTE: OpenVoiceV2Processor / OpenVoiceV2 are not part of the standard transformers API;
# this import assumes a custom build that exposes them (otherwise use the official OpenVoice package).
from transformers import OpenVoiceV2Processor, OpenVoiceV2
# Load ASR model and processor
processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Load text-to-text model and tokenizer (Llama 3 is a causal LM, not a seq2seq model)
text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Load TTS model (see the note on the OpenVoiceV2 import above)
tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")
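# NOTE: meta-llama/Meta-Llama-3-8B is a gated checkpoint; downloading it requires accepting
# the license on Hugging Face and authenticating (e.g. an HF_TOKEN secret on the Space).
# On Spaces ZeroGPU the @spaces.GPU decorators allocate a GPU per call; moving the models
# to CUDA inside the decorated functions (e.g. model.to("cuda")) may also be needed.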
# ASR function
@spaces.GPU()
def transcribe(audio):
    waveform, sample_rate = torchaudio.load(audio)
    # Whisper expects 16 kHz input
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    # Use a single (mono) channel as a 1-D array, as expected by the processor
    inputs = processor_asr(waveform[0].numpy(), sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        # Whisper is a seq2seq model, so decoding uses generate(), not CTC argmax over logits
        predicted_ids = model_asr.generate(inputs.input_features)
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
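# Whisper's feature extractor pads/truncates each clip to a 30-second window, so the short
# 2-second chunks recorded below are handled without extra preprocessing.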
# Text-to-text function
@spaces.GPU(duration=300)
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = text_model.generate(**inputs, max_new_tokens=256)
    # Strip the prompt tokens so only the generated continuation is returned
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
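# NOTE: Meta-Llama-3-8B is the base (non-instruct) checkpoint, so generate() continues the
# prompt rather than answering it; the -Instruct variant with its chat template is the usual
# choice for conversational replies.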
# TTS function
@spaces.GPU(duration=300)
def synthesize_speech(text):
    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        # NOTE: the calls below assume the custom OpenVoiceV2 wrapper exposes a
        # Tacotron-style interface; the official OpenVoice package uses its own API.
        mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
        audio = tts_model.infer(mel_outputs_postnet)
    return audio
# Real-time processing function
@spaces.GPU(duration=300)
def real_time_pipeline():
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)

    wake_word = "hello mate"
    wake_word_detected = False

    print("Listening for wake word...")

    try:
        while True:
            # Record ~2 seconds of audio from the microphone
            frames = []
            for _ in range(0, int(16000 / 1024 * 2)):
                data = stream.read(1024)
                frames.append(data)
            audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)

            # Save the audio to a temporary file for ASR
            wf = wave.open("temp.wav", 'wb')
            wf.setnchannels(1)
            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wf.setframerate(16000)
            wf.writeframes(b''.join(frames))
            wf.close()

            # Step 1: Transcribe audio to text and check for the wake word
            transcription = transcribe("temp.wav").lower()

            if wake_word in transcription:
                wake_word_detected = True
                print("Wake word detected. Processing audio...")

                while wake_word_detected:
                    # Record the next ~2 seconds of audio
                    frames = []
                    for _ in range(0, int(16000 / 1024 * 2)):
                        data = stream.read(1024)
                        frames.append(data)
                    audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)

                    # Save the audio to a temporary file for ASR
                    wf = wave.open("temp.wav", 'wb')
                    wf.setnchannels(1)
                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                    wf.setframerate(16000)
                    wf.writeframes(b''.join(frames))
                    wf.close()

                    # Step 1: Transcribe audio to text
                    transcription = transcribe("temp.wav")

                    # Step 2: Generate response using text-to-text model
                    response = generate_response(transcription)

                    # Step 3: Synthesize speech from text
                    synthesized_audio = synthesize_speech(response)

                    # Save the synthesized audio to a temporary file
                    # (assumes a 22.05 kHz waveform tensor shaped [channels, 1, samples])
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

                    # Play the synthesized audio
                    wf = wave.open(output_path, 'rb')
                    stream_out = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                                        channels=wf.getnchannels(),
                                        rate=wf.getframerate(),
                                        output=True)
                    data = wf.readframes(1024)
                    while data:
                        stream_out.write(data)
                        data = wf.readframes(1024)
                    stream_out.stop_stream()
                    stream_out.close()
                    wf.close()
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
# Gradio interface
gr_interface = gr.Interface(
    fn=real_time_pipeline,
    inputs=None,
    outputs=None,
    live=True,
    title="Real-Time Audio-to-Audio Model",
    description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
)

gr_interface.launch(inline=False)