Krishnavamshithumma committed on
Commit 9126db3 · verified · 1 Parent(s): 12ef89e

Update app.py

Files changed (1)
  1. app.py +61 -166
app.py CHANGED
@@ -2,20 +2,17 @@ import gradio as gr
  from openai import OpenAI
  import speech_recognition as sr
  import os
- import io # For in-memory file handling
- import scipy.io.wavfile as wavfile # For writing/reading WAV data to/from in-memory file
- import numpy as np # To handle the audio array
- import datetime # For logging timestamps (not directly used in this version)
-
- # --- Fetch API Key from Environment Variable ---
- # This is the SECURE way to handle API keys in Hugging Face Spaces.
- # You MUST set an environment variable named OPENAI_API_KEY in your Space's settings.
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

- # --- Define the OpenAI Models to use ---
- OPENAI_STT_MODEL = "whisper-1" # Using Whisper for Speech-to-Text
- OPENAI_CHAT_MODEL = "gpt-3.5-turbo" # Using GPT-3.5 Turbo for chat responses
- OPENAI_TTS_MODEL = "tts-1" # Using TTS-1 for Text-to-Speech

  system_prompt = """
  You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:
@@ -55,214 +52,112 @@ system_prompt = """
  # Initialize the SpeechRecognition Recognizer
  r = sr.Recognizer()

- # Modified function to accept audio as a numpy array and samplerate
  def transcribe_audio_and_chat(audio_tuple, history):
-     # Check if API key is available in environment
      if not OPENAI_API_KEY:
-         raise gr.Error("❌ OpenAI API key not found. Please set OPENAI_API_KEY as a Space Secret.")

-     # Handle cases where history might be None (defensive programming)
      if history is None:
          history = []

-     # Initialize tts_audio_output to None, so we always return it
-     tts_audio_output = None

      if audio_tuple is None:
-         # If no audio, raise a Gradio Error directly instead of adding to chat history
-         # Return history, history, None, None to clear inputs/outputs appropriately
-         return history, history, None, None

      samplerate, audio_np_array = audio_tuple

      try:
-         # Convert the NumPy array to a format speech_recognition can handle (in-memory WAV)
          if audio_np_array.dtype != np.int16:
-             audio_np_array = audio_np_array.astype(np.int16)
-
-         wav_byte_io = io.BytesIO()
-         wavfile.write(wav_byte_io, samplerate, audio_np_array)
-         wav_byte_io.seek(0) # Rewind to the beginning of the BytesIO object

-         # Create an AudioFile object from the in-memory WAV data
-         with sr.AudioFile(wav_byte_io) as source:
-             audio_data = r.record(source) # read the entire audio file

-         # --- Speech-to-Text (STT) ---
-         try:
-             # Using OpenAI's Whisper model for STT
              client = OpenAI(api_key=OPENAI_API_KEY)
-             # OpenAI's Whisper API typically expects audio in certain formats.
-             # While speech_recognition handles BytesIO, OpenAI's client.audio.transcriptions.create
-             # might prefer a direct file-like object or a path.
-             # For simplicity with BytesIO, we'll try to use speech_recognition's built-in recognizer.
-             # If you want to use OpenAI's ASR directly (e.g., Whisper), you'd need to adapt.
-             # For this code, we're sticking with `recognize_google` which uses Google's API by default.
-             user_input = r.recognize_google(audio_data) # This uses Google's STT (free tier usually)
-
-             # If you wanted to use OpenAI's Whisper ASR here, you'd do:
-             # audio_file_for_whisper = io.BytesIO(wav_byte_io.getvalue()) # Reset stream for Whisper
-             # audio_file_for_whisper.name = "audio.wav" # Whisper API needs a filename for BytesIO
-             # transcript = client.audio.transcriptions.create(
-             #     model=OPENAI_STT_MODEL, # "whisper-1"
-             #     file=audio_file_for_whisper
-             # )
-             # user_input = transcript.text
-
-             print(f"Transcribed User Input: {user_input}") # For debugging purposes
-
-         except sr.UnknownValueError:
-             history.append({"role": "assistant", "content": "Sorry, I could not understand the audio. Please try again."})
-             return history, history, None, tts_audio_output # Still clear inputs/outputs
-         except sr.RequestError as e:
-             history.append({"role": "assistant", "content": f"Could not request results from Speech Recognition service; {e}"})
-             return history, history, None, tts_audio_output # Still clear inputs/outputs
-
-         # --- Chat Completion ---
-         client = OpenAI(api_key=OPENAI_API_KEY)

          messages_for_openai = [{"role": "system", "content": system_prompt}] + history
          messages_for_openai.append({"role": "user", "content": user_input})

-         response = client.chat.completions.create(
              model=OPENAI_CHAT_MODEL,
              messages=messages_for_openai,
              temperature=0.7
          )

-         bot_reply = response.choices[0].message.content
-
          history.append({"role": "user", "content": user_input})
          history.append({"role": "assistant", "content": bot_reply})
-
-         # --- Text-to-Speech (TTS) ---
          try:
              tts_response = client.audio.speech.create(
-                 model=OPENAI_TTS_MODEL, # "tts-1"
-                 voice="alloy", # You can choose from "alloy", "echo", "fable", "onyx", "nova", "shimmer"
                  input=bot_reply,
-                 response_format="wav" # Request WAV format for easy in-memory processing
              )
-
-             # Read the audio stream into a BytesIO object
-             tts_audio_bytes = io.BytesIO()
-             for chunk in tts_response.iter_bytes(chunk_size=4096):
-                 tts_audio_bytes.write(chunk)
-             tts_audio_bytes.seek(0) # Rewind for reading

-             # Read the WAV data using scipy
-             tts_samplerate, tts_numpy_array = wavfile.read(tts_audio_bytes)
-             tts_audio_output = (tts_samplerate, tts_numpy_array) # Format for gr.Audio(type="numpy") output

          except Exception as tts_e:
-             print(f"Error generating TTS: {tts_e}")
-             # If TTS fails, log the error but don't stop the chat.
-             # The TTS audio output will just be None.
-             tts_audio_output = None # Ensure it's None if there's an error
-             history.append({"role": "assistant", "content": "(Voice generation failed.)"}) # Optional: notify user

-         # Return all required outputs: chatbot history, state history, cleared audio input, TTS audio
-         return history, history, None, tts_audio_output

      except Exception as e:
-         print(f"An unexpected error occurred: {e}")
-         # Ensure all outputs are returned even on a general error
-         raise gr.Error(f"❌ An unexpected error occurred: {str(e)}")
-

- # --- Gradio UI setup ---
  with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
      gr.Markdown("## 🎙️ Krishnavamshi Thumma - Voice Assistant")

-     gr.HTML("""
-     <style>
-         #chatBox {
-             height: 60vh;
-             overflow-y: auto;
-             padding: 20px;
-             border-radius: 10px;
-             background: #f9f9f9;
-             margin-bottom: 20px;
-         }
-         .message {
-             margin: 10px 0;
-             padding: 12px;
-             border-radius: 8px;
-         }
-         .user {
-             background: #e3f2fd;
-             text-align: right;
-         }
-         .bot {
-             background: #f5f5f5;
-         }
-         #audioInputComponent {
-             margin-top: 20px;
-         }
-         .key-status { /* Not strictly needed anymore but keeping for style consistency if other status messages arise */
-             padding: 5px;
-             margin-top: 5px;
-             border-radius: 4px;
-         }
-         .success {
-             background: #d4edda;
-             color: #155724;
-         }
-         .error {
-             background: #f8d7da;
-             color: #721c24;
-         }
-     </style>
-     """)
-
-     # --- UI Components ---
-     # Chatbot component to display messages
-     chatbot = gr.Chatbot(elem_id="chatBox", type="messages", height=400)
-     # State component to maintain chat history in OpenAI's message format
-     state = gr.State([])
-
-     # Audio input component for microphone recording
      audio_input = gr.Audio(
          sources=["microphone"],
-         type="numpy", # Receive audio as (samplerate, numpy_array)
          label="Speak your message here",
-         elem_id="audioInputComponent",
-         streaming=False # Process audio after full recording
      )

-     # New: Audio output component for TTS playback
      tts_audio_output = gr.Audio(
          label="Bot's Voice Response",
-         type="numpy", # Expects (samplerate, numpy_array) for playback
-         autoplay=True, # Automatically play the audio
-         waveform_options={
-             "skip_length": 0,
-             "waveform_color": "#2196F3",
-             "waveform_progress_color": "#4CAF50",
-             # Removed 'cursor_color' and 'unfilled_waveform_color' as they are not standard options here
-         }
      )

      clear_btn = gr.Button("🗑️ Clear Chat")

-     # Event handler for audio input change
      audio_input.change(
          fn=transcribe_audio_and_chat,
-         inputs=[audio_input, state], # api_key is now global
-         # Outputs: 1. chatbot display, 2. state (updated history),
-         # 3. audio_input (to clear it), 4. tts_audio_output (for playing bot's voice)
          outputs=[chatbot, state, audio_input, tts_audio_output]
      )

-     # JavaScript (no changes needed for API key part here as it's removed)
-     gr.HTML("""
-     <script>
-         // You can add other useful JS here if needed in the future
-     </script>
-     """)
-
-     # Clear button functionality: resets chatbot and state to empty
-     # Also clear the TTS audio output when chat is cleared
      clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])

- demo.launch()

  from openai import OpenAI
  import speech_recognition as sr
  import os
+ import io
+ import tempfile
+ import scipy.io.wavfile as wavfile
+ import numpy as np
+ import datetime

+ # Load API key from environment
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ OPENAI_STT_MODEL = "whisper-1"
+ OPENAI_CHAT_MODEL = "gpt-3.5-turbo"
+ OPENAI_TTS_MODEL = "tts-1"

  system_prompt = """
  You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:

  # Initialize the SpeechRecognition Recognizer
  r = sr.Recognizer()

  def transcribe_audio_and_chat(audio_tuple, history):
      if not OPENAI_API_KEY:
+         raise gr.Error("❌ OpenAI API key not found.")

      if history is None:
          history = []

+     audio_output_path = None # Default output path to return (for TTS playback)

      if audio_tuple is None:
+         return history, history, None, None

      samplerate, audio_np_array = audio_tuple

      try:
          if audio_np_array.dtype != np.int16:
+             audio_np_array = audio_np_array.astype(np.int16)

+         # Save user audio temporarily for Whisper
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
+             wavfile.write(temp_audio_file.name, samplerate, audio_np_array)
+             temp_audio_file.flush()

+             # Use OpenAI Whisper STT
              client = OpenAI(api_key=OPENAI_API_KEY)
+             with open(temp_audio_file.name, "rb") as file:
+                 transcript = client.audio.transcriptions.create(
+                     model=OPENAI_STT_MODEL,
+                     file=file
+                 )
+             user_input = transcript.text
+
+         print(f"Transcribed Input: {user_input}")

+         # Chat Completion
          messages_for_openai = [{"role": "system", "content": system_prompt}] + history
          messages_for_openai.append({"role": "user", "content": user_input})

+         chat_response = client.chat.completions.create(
              model=OPENAI_CHAT_MODEL,
              messages=messages_for_openai,
              temperature=0.7
          )

+         bot_reply = chat_response.choices[0].message.content
+
          history.append({"role": "user", "content": user_input})
          history.append({"role": "assistant", "content": bot_reply})
+
+         # Generate TTS audio and save to temp file
          try:
              tts_response = client.audio.speech.create(
+                 model=OPENAI_TTS_MODEL,
+                 voice="alloy",
                  input=bot_reply,
+                 response_format="mp3"
              )

+             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tts_temp_file:
+                 for chunk in tts_response.iter_bytes():
+                     tts_temp_file.write(chunk)
+                 audio_output_path = tts_temp_file.name

          except Exception as tts_e:
+             print(f"Error in TTS: {tts_e}")
+             history.append({"role": "assistant", "content": bot_reply + " (Voice failed to generate.)"})
+             audio_output_path = None

+         return history, history, None, audio_output_path

      except Exception as e:
+         print(f"Unexpected error: {e}")
+         raise gr.Error(f"❌ Unexpected error: {str(e)}")

+ # Gradio UI
  with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
      gr.Markdown("## 🎙️ Krishnavamshi Thumma - Voice Assistant")

+     chatbot = gr.Chatbot(type="messages", height=400)
+     state = gr.State([])
+
      audio_input = gr.Audio(
          sources=["microphone"],
+         type="numpy",
          label="Speak your message here",
+         streaming=False
      )

+     # Output as file path (so Gradio can handle autoplay correctly)
      tts_audio_output = gr.Audio(
          label="Bot's Voice Response",
+         type="filepath",
+         autoplay=True
      )

      clear_btn = gr.Button("🗑️ Clear Chat")

      audio_input.change(
          fn=transcribe_audio_and_chat,
+         inputs=[audio_input, state],
+         outputs=[chatbot, state, audio_input, tts_audio_output]
      )

      clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])

+ demo.launch()
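
For context, here is a minimal, self-contained sketch of the pipeline this commit switches to (mic samples → temporary WAV → Whisper transcription → chat completion → TTS written to a temporary MP3 whose path is handed to gr.Audio(type="filepath")). It assumes the OpenAI Python SDK v1 client and the same model names as app.py; the run_turn helper and its signature are illustrative only and are not part of the committed code.

    # Illustrative sketch only -- mirrors the flow of the updated app.py, not the committed file itself.
    import tempfile

    import numpy as np
    import scipy.io.wavfile as wavfile
    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

    def run_turn(samplerate, audio_np_array, history):
        """One voice turn: Whisper STT -> chat completion -> TTS saved to a temp MP3 path.

        `history` is a list of OpenAI-style messages, seeded with the system prompt by the caller.
        """
        # 1) Whisper expects a real file, so write the microphone samples to a temporary WAV.
        with tempfile.NamedTemporaryFile(suffix=".wav") as wav_tmp:
            wavfile.write(wav_tmp.name, samplerate, audio_np_array.astype(np.int16))
            with open(wav_tmp.name, "rb") as f:
                user_input = client.audio.transcriptions.create(model="whisper-1", file=f).text

        # 2) Ask the chat model, carrying the running message history.
        messages = history + [{"role": "user", "content": user_input}]
        reply = client.chat.completions.create(
            model="gpt-3.5-turbo", messages=messages, temperature=0.7
        ).choices[0].message.content
        history = messages + [{"role": "assistant", "content": reply}]

        # 3) Synthesize the reply as MP3 on disk; a file path is what gr.Audio(type="filepath") plays back.
        speech = client.audio.speech.create(model="tts-1", voice="alloy",
                                            input=reply, response_format="mp3")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3_tmp:
            for chunk in speech.iter_bytes():
                mp3_tmp.write(chunk)
            return history, mp3_tmp.name

Note that reopening the NamedTemporaryFile by name inside the with block works on Linux (including Hugging Face Spaces); on Windows it would need delete=False.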