VanguardAI committed
Commit 14cda64
1 Parent(s): ee7063c

Create app.py

Files changed (1)
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
+ import torch
+ import torchaudio
+ import gradio as gr
+ import pyaudio
+ import wave
+ import numpy as np
+ import spaces  # provides the @spaces.GPU decorator used below (Hugging Face ZeroGPU Spaces)
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
+ # NOTE: OpenVoice V2 is not distributed as transformers classes; the two names below are
+ # assumed to come from a project-specific wrapper around myshell-ai/OpenVoiceV2.
+ from openvoice_v2_wrapper import OpenVoiceV2Processor, OpenVoiceV2  # hypothetical wrapper module
+
+ # Load ASR model and processor
+ processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
+ model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
+
+ # Load text-to-text model and tokenizer (Llama 3 is a causal LM, not an encoder-decoder model)
+ text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
+
+ # Load TTS model (via the assumed OpenVoice V2 wrapper, see imports above)
+ tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
+ tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")
+
+ # ASR function
+ @spaces.GPU()
+ def transcribe(audio):
+     # Whisper is a seq2seq model, so transcription goes through generate(), not CTC logits
+     waveform, sample_rate = torchaudio.load(audio)
+     if sample_rate != 16000:
+         waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
+     inputs = processor_asr(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
+     with torch.no_grad():
+         predicted_ids = model_asr.generate(inputs.input_features)
+     transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
+     return transcription[0]
+
+ # Text-to-text function
+ @spaces.GPU(duration=300)
+ def generate_response(text):
+     # No padding needed for a single prompt (the Llama 3 tokenizer has no pad token by default)
+     inputs = tokenizer(text, return_tensors="pt", truncation=True)
+     outputs = text_model.generate(**inputs, max_new_tokens=256)
+     # Decode only the newly generated tokens, not the echoed prompt
+     response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+     return response
+
+ # TTS function
+ @spaces.GPU(duration=300)
+ def synthesize_speech(text):
+     # The exact inference API depends on the assumed OpenVoice V2 wrapper; a Tacotron-style
+     # mel-then-vocoder interface is sketched here as an assumption.
+     inputs = tts_processor(text, return_tensors="pt")
+     with torch.no_grad():
+         mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
+         audio = tts_model.infer(mel_outputs_postnet)
+     return audio
+
+ # Real-time processing function: listen for the wake word, then run ASR -> LLM -> TTS continuously
+ @spaces.GPU(duration=300)
+ def real_time_pipeline():
+     p = pyaudio.PyAudio()
+     stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
+
+     wake_word = "hello mate"
+     wake_word_detected = False
+
+     print("Listening for wake word...")
+
+     try:
+         while True:
+             frames = []
+             for _ in range(0, int(16000 / 1024 * 2)):  # 2 seconds of audio
+                 data = stream.read(1024)
+                 frames.append(data)
+             audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
+
+             # Save the audio to a temporary file for ASR
+             wf = wave.open("temp.wav", 'wb')
+             wf.setnchannels(1)
+             wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+             wf.setframerate(16000)
+             wf.writeframes(b''.join(frames))
+             wf.close()
+
+             # Step 1: Transcribe audio to text
+             transcription = transcribe("temp.wav").lower()
+
+             if wake_word in transcription:
+                 wake_word_detected = True
+                 print("Wake word detected. Processing audio...")
+
+                 # Conversation loop: runs until the process is interrupted
+                 while wake_word_detected:
+                     frames = []
+                     for _ in range(0, int(16000 / 1024 * 2)):  # 2 seconds of audio
+                         data = stream.read(1024)
+                         frames.append(data)
+                     audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
+
+                     # Save the audio to a temporary file for ASR
+                     wf = wave.open("temp.wav", 'wb')
+                     wf.setnchannels(1)
+                     wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+                     wf.setframerate(16000)
+                     wf.writeframes(b''.join(frames))
+                     wf.close()
+
+                     # Step 1: Transcribe audio to text
+                     transcription = transcribe("temp.wav")
+
+                     # Step 2: Generate response using text-to-text model
+                     response = generate_response(transcription)
+
+                     # Step 3: Synthesize speech from text
+                     synthesized_audio = synthesize_speech(response)
+
+                     # Save the synthesized audio to a temporary file
+                     # (shape and 22.05 kHz rate assume a mono waveform tensor from the TTS wrapper)
+                     output_path = "output.wav"
+                     torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)
+
+                     # Play the synthesized audio
+                     wf = wave.open(output_path, 'rb')
+                     stream_out = p.open(format=p.get_format_from_width(wf.getsampwidth()),
+                                         channels=wf.getnchannels(),
+                                         rate=wf.getframerate(),
+                                         output=True)
+
+                     data = wf.readframes(1024)
+                     while data:
+                         stream_out.write(data)
+                         data = wf.readframes(1024)
+                     stream_out.stop_stream()
+                     stream_out.close()
+                     wf.close()
+     except KeyboardInterrupt:
+         print("Stopping...")
+     finally:
+         stream.stop_stream()
+         stream.close()
+         p.terminate()
+
+ # Gradio interface
+ gr_interface = gr.Interface(
+     fn=real_time_pipeline,
+     inputs=None,
+     outputs=None,
+     live=True,
+     title="Real-Time Audio-to-Audio Model",
+     description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
+ )
+
+ gr_interface.launch(inline=False)