VanguardAI committed on
Commit f1f4016
1 Parent(s): 796ef02

Update app.py

Files changed (1)
  1. app.py +34 -56
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torchaudio
 import gradio as gr
-import pyaudio
+import soundfile as sf
 import wave
 import numpy as np
 from transformers import WhisperForCTC, WhisperProcessor, AutoModelForSeq2SeqLM, AutoTokenizer
@@ -29,16 +29,16 @@ def transcribe(audio):
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor_asr.batch_decode(predicted_ids)
     return transcription[0]
-
-@spaces.GPU(duration=300)
+
+@spaces.GPU()
 # Text-to-text function
 def generate_response(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     outputs = text_model.generate(**inputs)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
-
-@spaces.GPU(duration=300)
+
+@spaces.GPU()
 # TTS function
 def synthesize_speech(text):
     inputs = tts_processor(text, return_tensors="pt")
@@ -46,58 +46,49 @@ def synthesize_speech(text):
     mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
     audio = tts_model.infer(mel_outputs_postnet)
     return audio
-
-@spaces.GPU(duration=300)
+
+@spaces.GPU()
 # Real-time processing function
 def real_time_pipeline():
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
+    # Adjust this part to handle live recording using soundfile and play back using simpleaudio
+    import simpleaudio as sa
+    import tempfile
+    import time
 
     wake_word = "hello mate"
     wake_word_detected = False
 
     print("Listening for wake word...")
 
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_wav_file:
+        tmp_wav_path = tmp_wav_file.name
+
     try:
         while True:
-            frames = []
-            for _ in range(0, int(16000 / 1024 * 2)):  # 2 seconds of audio
-                data = stream.read(1024)
-                frames.append(data)
-            audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
-
-            # Save the audio to a temporary file for ASR
-            wf = wave.open("temp.wav", 'wb')
-            wf.setnchannels(1)
-            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-            wf.setframerate(16000)
-            wf.writeframes(b''.join(frames))
-            wf.close()
+            # Capture audio here (this is a simplified example, you need actual audio capture logic)
+            time.sleep(2)  # Simulate 2 seconds of audio capture
+
+            # Save the captured audio to the temp file for ASR
+            data, sample_rate = sf.read(tmp_wav_path)
+            sf.write(tmp_wav_path, data, sample_rate)
 
             # Step 1: Transcribe audio to text
-            transcription = transcribe("temp.wav").lower()
+            transcription = transcribe(tmp_wav_path).lower()
 
             if wake_word in transcription:
                 wake_word_detected = True
                 print("Wake word detected. Processing audio...")
 
                 while wake_word_detected:
-                    frames = []
-                    for _ in range(0, int(16000 / 1024 * 2)):  # 2 seconds of audio
-                        data = stream.read(1024)
-                        frames.append(data)
-                    audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
-
-                    # Save the audio to a temporary file for ASR
-                    wf = wave.open("temp.wav", 'wb')
-                    wf.setnchannels(1)
-                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                    wf.setframerate(16000)
-                    wf.writeframes(b''.join(frames))
-                    wf.close()
+                    # Capture audio here (this is a simplified example, you need actual audio capture logic)
+                    time.sleep(2)  # Simulate 2 seconds of audio capture
+
+                    # Save the captured audio to the temp file for ASR
+                    data, sample_rate = sf.read(tmp_wav_path)
+                    sf.write(tmp_wav_path, data, sample_rate)
 
                     # Step 1: Transcribe audio to text
-                    transcription = transcribe("temp.wav")
+                    transcription = transcribe(tmp_wav_path)
 
                     # Step 2: Generate response using text-to-text model
                     response = generate_response(transcription)
@@ -109,26 +100,12 @@ def real_time_pipeline():
                     output_path = "output.wav"
                     torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)
 
-                    # Play the synthesized audio
-                    wf = wave.open(output_path, 'rb')
-                    stream_out = p.open(format=p.get_format_from_width(wf.getsampwidth()),
-                                        channels=wf.getnchannels(),
-                                        rate=wf.getframerate(),
-                                        output=True)
-
-                    data = wf.readframes(1024)
-                    while data:
-                        stream_out.write(data)
-                        data = wf.readframes(1024)
-                    stream_out.stop_stream()
-                    stream_out.close()
-                    wf.close()
+                    # Play the synthesized audio using simpleaudio
+                    wave_obj = sa.WaveObject.from_wave_file(output_path)
+                    play_obj = wave_obj.play()
+                    play_obj.wait_done()
     except KeyboardInterrupt:
         print("Stopping...")
-    finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
 
 # Gradio interface
 gr_interface = gr.Interface(
@@ -140,4 +117,5 @@ gr_interface = gr.Interface(
     description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
 )
 
-iface.launch(inline=False)
+
+iface.launch(inline=False)
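
The rewritten loop still only simulates recording with time.sleep(2); as the in-diff comment notes, actual capture logic is left to be added. A minimal sketch of one way to fill that gap when running locally with a microphone, assuming the sounddevice package (not part of this commit) alongside the soundfile import the commit already adds:

    # Hypothetical helper, not in the commit: record a short mono block from the
    # default microphone with sounddevice and write it to the temp WAV file that
    # transcribe(tmp_wav_path) already reads.
    import sounddevice as sd
    import soundfile as sf

    def capture_audio(tmp_wav_path, seconds=2, sample_rate=16000):
        # 16 kHz mono matches the sample rate the Whisper ASR path expects
        recording = sd.rec(int(seconds * sample_rate), samplerate=sample_rate,
                           channels=1, dtype="float32")
        sd.wait()  # block until the recording buffer is full
        sf.write(tmp_wav_path, recording, sample_rate)
        return tmp_wav_path

On a hosted Space there is no server-side microphone, so in practice the audio would more likely come from the Gradio audio input and be passed to transcribe() directly; the helper above only stands in for the placeholder comment when running locally.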