ar08 committed
Commit 15450b6 · verified · 1 Parent(s): afd2929

Update app.py

Files changed (1)
  1. app.py +22 -38
app.py CHANGED
@@ -8,8 +8,6 @@ import tempfile
 import logging
 import io
 from pydub import AudioSegment
-import wave
-import pyaudio
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,23 +28,21 @@ headers = {"Authorization": f"Bearer {hf_token}"}
 chat_history = []
 
 async def text_to_speech_stream(text, voice_volume=1.0):
-    """Convert text to speech using edge_tts and yield audio chunks."""
+    """Convert text to speech using edge_tts and return the audio file path."""
     communicate = edge_tts.Communicate(text, "en-US-BrianMultilingualNeural")
     audio_data = b""
 
     async for chunk in communicate.stream():
         if chunk["type"] == "audio":
             audio_data += chunk["data"]
-            # Yield chunks for streaming
-            yield audio_data
-
-    # Adjust volume for the final audio
+
+    # Adjust volume
     audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
     adjusted_audio = audio + (20 * voice_volume - 20)  # Adjust volume (0.0 to 2.0)
-
-    buffer = io.BytesIO()
-    adjusted_audio.export(buffer, format="wav")
-    yield buffer.getvalue()
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+        adjusted_audio.export(temp_file.name, format="mp3")
+        return temp_file.name
 
 def whisper_speech_to_text(audio_path):
     """Convert speech to text using Hugging Face Whisper API."""
@@ -89,21 +85,23 @@ async def chat_with_ai(message):
         response_text = response.choices[0].message['content']
         chat_history.append({"role": "assistant", "content": response_text})
 
-        return response_text
+        audio_path = await text_to_speech_stream(response_text)
+
+        return response_text, audio_path
     except Exception as e:
         logging.error(f"Error in chat_with_ai: {e}")
-        return str(e)
+        return str(e), None
 
 def transcribe_and_chat(audio):
     if audio is None:
-        return "Sorry, no audio was provided. Please try recording again."
+        return "Sorry, no audio was provided. Please try recording again.", None
 
     text = whisper_speech_to_text(audio)
     if not text:
-        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again."
+        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again.", None
 
-    response = asyncio.run(chat_with_ai(text))
-    return response
+    response, audio_path = asyncio.run(chat_with_ai(text))
+    return response, audio_path
 
 def create_demo():
     with gr.Blocks(css="""
@@ -139,7 +137,7 @@ def create_demo():
 
             with gr.Column(scale=1):
                 chat_output = gr.Textbox(label="💬 AI Response", elem_id="chat-output", lines=5, interactive=False)
-                audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output", streaming=True)
+                audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output")
 
         # Add some spacing and a divider
         gr.Markdown("<hr style='border: 1px solid #8ecae6;'/>")
@@ -149,27 +147,13 @@ def create_demo():
             logging.info(f"Received audio: {audio}")
            if audio is None:
                 return "No audio detected. Please try recording again.", None
-            response = transcribe_and_chat(audio)
-            logging.info(f"Response: {response}")
-            return response, None
-
-        def stream_audio(response, volume):
-            def generate_audio():
-                for chunk in asyncio.run(text_to_speech_stream(response, volume)):
-                    yield chunk
-
-            return gr.Audio(generate_audio(), streaming=True)
-
-        audio_input.change(
-            process_audio,
-            inputs=[audio_input, voice_volume],
-            outputs=[chat_output, audio_output]
-        ).then(
-            stream_audio,
-            inputs=[chat_output, voice_volume],
-            outputs=audio_output
-        )
+            response, audio_path = transcribe_and_chat(audio)
+            # Adjust volume for the response audio
+            adjusted_audio_path = asyncio.run(text_to_speech_stream(response, volume))
+            logging.info(f"Response: {response}, Audio path: {adjusted_audio_path}")
+            return response, adjusted_audio_path
 
+        audio_input.change(process_audio, inputs=[audio_input, voice_volume], outputs=[chat_output, audio_output])
         clear_button.click(lambda: (None, None), None, [chat_output, audio_output])
 
         # JavaScript to handle autoplay, automatic submission, and auto-listen
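For reference, a minimal, self-contained sketch of the pattern this commit moves to: collect the MP3 bytes from edge_tts, adjust the volume with pydub, write the result to a temporary file, and return that path (which the non-streaming gr.Audio output then plays). The helper name tts_to_tempfile and the __main__ usage are illustrative, not part of app.py; edge_tts and pydub (with ffmpeg available) are assumed to be installed.

```python
# Sketch of the post-commit TTS flow (assumes edge_tts, pydub, ffmpeg).
import asyncio
import io
import tempfile

import edge_tts
from pydub import AudioSegment


async def tts_to_tempfile(text: str, voice_volume: float = 1.0) -> str:
    """Synthesize `text`, apply a volume offset, and return a temp MP3 path."""
    communicate = edge_tts.Communicate(text, "en-US-BrianMultilingualNeural")
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted = audio + (20 * voice_volume - 20)  # same dB mapping as app.py (0.0–2.0)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        adjusted.export(tmp.name, format="mp3")
        return tmp.name


if __name__ == "__main__":
    path = asyncio.run(tts_to_tempfile("Hello from the updated app.", 1.0))
    print(f"Audio written to {path}")  # this path is what gr.Audio receives
```

Returning a file path instead of yielding raw chunks lets gr.Audio (without streaming=True) load and autoplay the clip, which is why the stream_audio step and the chained .then() wiring are dropped in this commit.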