Spaces: VanguardAI (Build error)
Commit f1f4016 • Parent(s): 796ef02 • VanguardAI committed • Update app.py

app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torchaudio
 import gradio as gr
-import
+import soundfile as sf
 import wave
 import numpy as np
 from transformers import WhisperForCTC, WhisperProcessor, AutoModelForSeq2SeqLM, AutoTokenizer
@@ -29,16 +29,16 @@ def transcribe(audio):
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor_asr.batch_decode(predicted_ids)
     return transcription[0]
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Text-to-text function
 def generate_response(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     outputs = text_model.generate(**inputs)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
-
-@spaces.GPU(
+
+@spaces.GPU()
 # TTS function
 def synthesize_speech(text):
     inputs = tts_processor(text, return_tensors="pt")
@@ -46,58 +46,49 @@ def synthesize_speech(text):
     mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
     audio = tts_model.infer(mel_outputs_postnet)
     return audio
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Real-time processing function
 def real_time_pipeline():
-
-
+    # Adjust this part to handle live recording using soundfile and play back using simpleaudio
+    import simpleaudio as sa
+    import tempfile
+    import time
 
     wake_word = "hello mate"
     wake_word_detected = False
 
     print("Listening for wake word...")
 
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_wav_file:
+        tmp_wav_path = tmp_wav_file.name
+
     try:
         while True:
-
-
-
-
-
-
-            # Save the audio to a temporary file for ASR
-            wf = wave.open("temp.wav", 'wb')
-            wf.setnchannels(1)
-            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-            wf.setframerate(16000)
-            wf.writeframes(b''.join(frames))
-            wf.close()
+            # Capture audio here (this is a simplified example, you need actual audio capture logic)
+            time.sleep(2)  # Simulate 2 seconds of audio capture
+
+            # Save the captured audio to the temp file for ASR
+            data, sample_rate = sf.read(tmp_wav_path)
+            sf.write(tmp_wav_path, data, sample_rate)
 
             # Step 1: Transcribe audio to text
-            transcription = transcribe(
+            transcription = transcribe(tmp_wav_path).lower()
 
             if wake_word in transcription:
                 wake_word_detected = True
                 print("Wake word detected. Processing audio...")
 
                 while wake_word_detected:
-
-
-
-
-
-
-                    # Save the audio to a temporary file for ASR
-                    wf = wave.open("temp.wav", 'wb')
-                    wf.setnchannels(1)
-                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                    wf.setframerate(16000)
-                    wf.writeframes(b''.join(frames))
-                    wf.close()
+                    # Capture audio here (this is a simplified example, you need actual audio capture logic)
+                    time.sleep(2)  # Simulate 2 seconds of audio capture
+
+                    # Save the captured audio to the temp file for ASR
+                    data, sample_rate = sf.read(tmp_wav_path)
+                    sf.write(tmp_wav_path, data, sample_rate)
 
                     # Step 1: Transcribe audio to text
-                    transcription = transcribe(
+                    transcription = transcribe(tmp_wav_path)
 
                     # Step 2: Generate response using text-to-text model
                     response = generate_response(transcription)
@@ -109,26 +100,12 @@ def real_time_pipeline():
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

-                    # Play the synthesized audio
-
-
-
-                        rate=wf.getframerate(),
-                        output=True)
-
-                    data = wf.readframes(1024)
-                    while data:
-                        stream_out.write(data)
-                        data = wf.readframes(1024)
-                    stream_out.stop_stream()
-                    stream_out.close()
-                    wf.close()
+                    # Play the synthesized audio using simpleaudio
+                    wave_obj = sa.WaveObject.from_wave_file(output_path)
+                    play_obj = wave_obj.play()
+                    play_obj.wait_done()
     except KeyboardInterrupt:
         print("Stopping...")
-    finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()

 # Gradio interface
 gr_interface = gr.Interface(
@@ -140,4 +117,5 @@ gr_interface = gr.Interface(
     description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
 )

-
+
+iface.launch(inline=False)
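
The build error this commit addresses appears to stem from the unclosed @spaces.GPU( decorators, which the new revision closes as @spaces.GPU(). As a reference point, here is a minimal sketch of how the ZeroGPU decorator is typically attached to one of these functions; the "import spaces" line is an assumption, since no such import is visible in the hunks shown (it may already sit in the unchanged lines 8-28 of app.py):

import spaces  # assumption: needed for the decorator, not visible in the shown hunks

@spaces.GPU()  # on a ZeroGPU Space, allocates a GPU for the duration of each call
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = text_model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)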
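
In both loops the commit stands in for microphone capture with time.sleep(2) and then reads back the temp file, which at that point has had nothing written to it, so there is no audio for transcribe() to consume. Below is a minimal capture sketch, assuming the third-party sounddevice package (not a dependency of this commit) and a temp file created with suffix=".wav" so that soundfile can infer the output format; the record_clip helper is hypothetical:

import sounddevice as sd   # assumed extra dependency, not used anywhere in this commit
import soundfile as sf

def record_clip(path, seconds=2, sample_rate=16000):
    # Record a short mono clip from the default input device.
    audio = sd.rec(int(seconds * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()                           # block until the recording has finished
    sf.write(path, audio, sample_rate)  # format is inferred from the .wav extension
    return path

Inside real_time_pipeline() this would stand in for the time.sleep(2) / sf.read round trip: record_clip(tmp_wav_path) followed by transcription = transcribe(tmp_wav_path).lower().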
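
Only the tail of transcribe() is visible above (lines 29-31), and the updated pipeline now hands it a file path (tmp_wav_path). For comparison, here is a sketch of file-based Whisper transcription using the standard transformers seq2seq API; the checkpoint name and the transcribe_file helper are illustrative assumptions, and the repository's own transcribe() (built on WhisperForCTC with an argmax decode) may differ:

import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor

asr_processor = WhisperProcessor.from_pretrained("openai/whisper-small")   # assumed checkpoint
asr_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

def transcribe_file(path):
    waveform, sample_rate = torchaudio.load(path)          # load the saved clip
    if sample_rate != 16000:                               # Whisper expects 16 kHz input
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    mono = waveform.mean(dim=0).numpy()                    # downmix to mono
    inputs = asr_processor(mono, sampling_rate=16000, return_tensors="pt")
    generated_ids = asr_model.generate(inputs.input_features)  # seq2seq decoding instead of CTC argmax
    return asr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]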
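
The new closing line calls iface.launch(inline=False), but the only interface object visible in this diff is assigned to gr_interface (the gr.Interface(...) block above). Unless iface is defined somewhere in the unchanged part of app.py, the launch call would presumably need to reference that object instead, along the lines of:

gr_interface.launch(inline=False)  # sketch: launch the Interface defined above; assumes no separate iface exists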