Krishnavamshithumma committed on
Commit 5a55703 · verified · 1 Parent(s): a8d175f

Update app.py

Files changed (1)
app.py +93 -36
app.py CHANGED
@@ -3,17 +3,19 @@ from openai import OpenAI
 import speech_recognition as sr
 import os
 import io # For in-memory file handling
-import scipy.io.wavfile as wavfile # For writing WAV data to in-memory file
+import scipy.io.wavfile as wavfile # For writing/reading WAV data to/from in-memory file
 import numpy as np # To handle the audio array
-import datetime # For logging timestamps if needed (not directly used in this version, but good practice)
+import datetime # For logging timestamps (not directly used in this version)
 
 # --- Fetch API Key from Environment Variable ---
 # This is the SECURE way to handle API keys in Hugging Face Spaces.
 # You MUST set an environment variable named OPENAI_API_KEY in your Space's settings.
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
-# --- Define the OpenAI Model to use ---
-OPENAI_MODEL = "gpt-3.5-turbo" # Changed from gpt-4o to gpt-3.5-turbo
+# --- Define the OpenAI Models to use ---
+OPENAI_STT_MODEL = "whisper-1" # Using Whisper for Speech-to-Text
+OPENAI_CHAT_MODEL = "gpt-3.5-turbo" # Using GPT-3.5 Turbo for chat responses
+OPENAI_TTS_MODEL = "tts-1" # Using TTS-1 for Text-to-Speech
 
 system_prompt = """
 You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:
@@ -22,7 +24,7 @@ system_prompt = """
 You possess 1.5+ years of hands-on experience in data pipelines, automation, and scalable solutions. Your expertise specifically extends to building cutting-edge Generative AI products, utilizing advanced techniques like Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) pipelines, various vector databases, and deep learning models. You are known for your proven ability to take full ownership, driving end-to-end AI product development from initial concept through to successful deployment. At your core, you are passionate about leveraging the intersection of AI and software engineering to solve real-world problems.
 
 **Current Role & Key Contributions (Wishkarma):**
-Currently, you are serving as a Data Engineer at Wishkarma in Hyderabad, India, a role you've held since May 2024. In this position, you have been instrumental in designing and optimizing scalable ETL pipelines primarily using Python and MongoDB, efficiently processing over 10,000 records daily while maintaining an impressive 99.9% data accuracy. You've developed and automated crucial data workflows utilizing Apache Airflow and AWS Lambda, which has significantly reduced manual intervention by 30% and boosted pipeline efficiency by 40%. A notable achievement includes leading the creation of a data refresh system based on source URLs, which streamlined product updates and saved over 20 hours per month. Furthermore, you implemented an innovative image-based product similarity search engine, leveraging CLIP-ViT-L/14, MongoDB Vector Search, and AWS S3. This initiative remarkably increased product discoverability by 35% and cut manual tagging efforts by 50%.
+Currently, you are serving as a Data Engineer at Wishkarma in Hyderabad, India, a role you've held since May 2024. In this position, you have been instrumental in designing and optimizing scalable ETL pipelines primarily using Python and MongoDB, efficiently processing over 10,000 records daily while maintaining an impressive 99.9% data accuracy. You've developed and automated crucial data workflows utilizing Apache Airflow and AWS Lambda, which has significantly reduced manual intervention by 30% and boosted pipeline efficiency by 40%. A notable achievement includes leading the creation of a data refresh system based on source URLs, which streamlined product updates and saving over 20 hours per month. Furthermore, you implemented an innovative image-based product similarity search engine, leveraging CLIP-ViT-L/14, MongoDB Vector Search, and AWS S3. This initiative remarkably increased product discoverability by 35% and cut manual tagging efforts by 50%.
 
 **Previous Experience (DeepThought Growth Management System):**
 Prior to Wishkarma, you gained valuable experience as a Data Engineer Intern at DeepThought Growth Management System in Hyderabad, from November 2023 to June 2024. Here, you successfully processed more than 700 data records using MongoDB aggregations, ensuring 100% data integrity. Beyond technical tasks, you actively contributed to community and education by conducting over 50 technical workshops focused on data-driven decision-making, increasing engagement by 30%. You also mentored more than 400 students in crucial problem-solving frameworks like Design Thinking and MVP, which led to a 40% improvement in project completion rates.
@@ -54,7 +56,7 @@ system_prompt = """
 r = sr.Recognizer()
 
 # Modified function to accept audio as a numpy array and samplerate
-def transcribe_audio_and_chat(audio_tuple, history): # Removed api_key from function arguments
+def transcribe_audio_and_chat(audio_tuple, history):
     # Check if API key is available in environment
     if not OPENAI_API_KEY:
         raise gr.Error("❌ OpenAI API key not found. Please set OPENAI_API_KEY as a Space Secret.")
@@ -63,9 +65,13 @@ def transcribe_audio_and_chat(audio_tuple, history): # Removed api_key from func
     if history is None:
         history = []
 
+    # Initialize tts_audio_output to None, so we always return it
+    tts_audio_output = None
+
     if audio_tuple is None:
         # If no audio, raise a Gradio Error directly instead of adding to chat history
-        return history, history, None
+        # Return history, history, None, None to clear inputs/outputs appropriately
+        return history, history, None, None
 
     samplerate, audio_np_array = audio_tuple
 
@@ -82,46 +88,85 @@ def transcribe_audio_and_chat(audio_tuple, history): # Removed api_key from func
         with sr.AudioFile(wav_byte_io) as source:
             audio_data = r.record(source) # read the entire audio file
 
-        # Perform speech recognition
+        # --- Speech-to-Text (STT) ---
         try:
-            user_input = r.recognize_google(audio_data) # Using Google Web Speech API
+            # Using OpenAI's Whisper model for STT
+            client = OpenAI(api_key=OPENAI_API_KEY)
+            # OpenAI's Whisper API typically expects audio in certain formats.
+            # While speech_recognition handles BytesIO, OpenAI's client.audio.transcriptions.create
+            # might prefer a direct file-like object or a path.
+            # For simplicity with BytesIO, we'll try to use speech_recognition's built-in recognizer.
+            # If you want to use OpenAI's ASR directly (e.g., Whisper), you'd need to adapt.
+            # For this code, we're sticking with `recognize_google` which uses Google's API by default.
+            user_input = r.recognize_google(audio_data) # This uses Google's STT (free tier usually)
+
+            # If you wanted to use OpenAI's Whisper ASR here, you'd do:
+            # audio_file_for_whisper = io.BytesIO(wav_byte_io.getvalue()) # Reset stream for Whisper
+            # audio_file_for_whisper.name = "audio.wav" # Whisper API needs a filename for BytesIO
+            # transcript = client.audio.transcriptions.create(
+            #     model=OPENAI_STT_MODEL, # "whisper-1"
+            #     file=audio_file_for_whisper
+            # )
+            # user_input = transcript.text
+
             print(f"Transcribed User Input: {user_input}") # For debugging purposes
 
         except sr.UnknownValueError:
             history.append({"role": "assistant", "content": "Sorry, I could not understand the audio. Please try again."})
-            return history, history, None # Reset audio input after error
+            return history, history, None, tts_audio_output # Still clear inputs/outputs
         except sr.RequestError as e:
-            history.append({"role": "assistant", "content": f"Could not request results from Google Speech Recognition service; {e}"})
-            return history, history, None # Reset audio input after error
+            history.append({"role": "assistant", "content": f"Could not request results from Speech Recognition service; {e}"})
+            return history, history, None, tts_audio_output # Still clear inputs/outputs
 
-        # --- Proceed with OpenAI chat ---
-        # Use the global OPENAI_API_KEY
+        # --- Chat Completion ---
         client = OpenAI(api_key=OPENAI_API_KEY)
 
-        # Create the full messages list for OpenAI, starting with the system prompt
        messages_for_openai = [{"role": "system", "content": system_prompt}] + history
        messages_for_openai.append({"role": "user", "content": user_input})
 
-        # Get response from OpenAI using the specified model
        response = client.chat.completions.create(
-            model=OPENAI_MODEL, # Use the global OPENAI_MODEL
-            messages=messages_for_openai, # Pass the correctly formatted messages
+            model=OPENAI_CHAT_MODEL,
+            messages=messages_for_openai,
            temperature=0.7
        )
 
        bot_reply = response.choices[0].message.content
 
-        # Append both the user input and bot reply to the *Gradio* history (state)
        history.append({"role": "user", "content": user_input})
        history.append({"role": "assistant", "content": bot_reply})
 
-        # Return the updated history for the chatbot component,
-        # history again for the 'state' component,
-        # and None for the audio input to clear it and make it ready for next input.
-        return history, history, None
+        # --- Text-to-Speech (TTS) ---
+        try:
+            tts_response = client.audio.speech.create(
+                model=OPENAI_TTS_MODEL, # "tts-1"
+                voice="alloy", # You can choose from "alloy", "echo", "fable", "onyx", "nova", "shimmer"
+                input=bot_reply,
+                response_format="wav" # Request WAV format for easy in-memory processing
+            )
+
+            # Read the audio stream into a BytesIO object
+            tts_audio_bytes = io.BytesIO()
+            for chunk in tts_response.iter_bytes(chunk_size=4096):
+                tts_audio_bytes.write(chunk)
+            tts_audio_bytes.seek(0) # Rewind for reading
+
+            # Read the WAV data using scipy
+            tts_samplerate, tts_numpy_array = wavfile.read(tts_audio_bytes)
+            tts_audio_output = (tts_samplerate, tts_numpy_array) # Format for gr.Audio(type="numpy") output
+
+        except Exception as tts_e:
+            print(f"Error generating TTS: {tts_e}")
+            # If TTS fails, log the error but don't stop the chat.
+            # The TTS audio output will just be None.
+            tts_audio_output = None # Ensure it's None if there's an error
+            history.append({"role": "assistant", "content": "(Voice generation failed.)"}) # Optional: notify user
+
+        # Return all required outputs: chatbot history, state history, cleared audio input, TTS audio
+        return history, history, None, tts_audio_output
 
    except Exception as e:
-        print(f"An unexpected error occurred: {e}") # Log the error for debugging
+        print(f"An unexpected error occurred: {e}")
+        # Ensure all outputs are returned even on a general error
        raise gr.Error(f"❌ An unexpected error occurred: {str(e)}")
 
 
@@ -154,7 +199,7 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
    #audioInputComponent {
        margin-top: 20px;
    }
-    .key-status { /* No longer strictly needed but keeping for style consistency if other status messages arise */
+    .key-status { /* Not strictly needed anymore but keeping for style consistency if other status messages arise */
        padding: 5px;
        margin-top: 5px;
        border-radius: 4px;
@@ -170,10 +215,7 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
    </style>
    """)
 
-    # Removed the API key textbox and its status display as it's now handled by Space Secrets
-    # api_key = gr.Textbox(...)
-    # key_status = gr.HTML(...)
-
+    # --- UI Components ---
    # Chatbot component to display messages
    chatbot = gr.Chatbot(elem_id="chatBox", type="messages", height=400)
    # State component to maintain chat history in OpenAI's message format
@@ -188,25 +230,40 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
        streaming=False # Process audio after full recording
    )
 
+    # New: Audio output component for TTS playback
+    tts_audio_output = gr.Audio(
+        label="Bot's Voice Response",
+        type="numpy", # Expects (samplerate, numpy_array) for playback
+        autoplay=True, # Automatically play the audio
+        waveform_options={
+            "skip_length_milliseconds": 0,
+            "cursor_color": "#000000",
+            "wave_color": "#2196F3",
+            "wave_progress_color": "#4CAF50",
+            "unfilled_waveform_color": "#E0E0E0"
+        }
+    )
+
    clear_btn = gr.Button("🗑️ Clear Chat")
 
-    # Event handler: When audio input is recorded and submitted (by stopping recording)
+    # Event handler for audio input change
    audio_input.change(
        fn=transcribe_audio_and_chat,
-        inputs=[audio_input, state], # Removed api_key from inputs as it's global
-        # Outputs: 1. chatbot display, 2. state (updated history), 3. audio_input (to clear it)
-        outputs=[chatbot, state, audio_input]
+        inputs=[audio_input, state], # api_key is now global
+        # Outputs: 1. chatbot display, 2. state (updated history),
+        # 3. audio_input (to clear it), 4. tts_audio_output (for playing bot's voice)
+        outputs=[chatbot, state, audio_input, tts_audio_output]
    )
 
-    # Removed JavaScript related to API key input
+    # JavaScript (no changes needed for API key part here as it's removed)
    gr.HTML("""
    <script>
-    // No specific API key JS needed anymore as it's handled by secrets
    // You can add other useful JS here if needed in the future
    </script>
    """)
 
    # Clear button functionality: resets chatbot and state to empty
-    clear_btn.click(lambda: ([], []), None, [chatbot, state])
+    # Also clear the TTS audio output when chat is cleared
+    clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])
 
  demo.launch()
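
Note: the commit keeps recognize_google as the active STT path and only sketches the Whisper alternative in comments. For reference, that alternative could be factored into a small helper along the lines below. This is a minimal sketch rather than part of the commit; the helper name transcribe_with_whisper is hypothetical, and it assumes the same in-memory WAV buffer (wav_byte_io) built inside transcribe_audio_and_chat plus the OPENAI_API_KEY Space secret.

# Hypothetical helper (not part of this commit): the Whisper-based STT path
# described in the inline comments, assuming wav_byte_io holds valid WAV bytes.
import io
import os
from openai import OpenAI

def transcribe_with_whisper(wav_byte_io: io.BytesIO) -> str:
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    audio_file = io.BytesIO(wav_byte_io.getvalue())
    audio_file.name = "audio.wav"  # the transcriptions endpoint infers the format from the filename
    transcript = client.audio.transcriptions.create(
        model="whisper-1",  # OPENAI_STT_MODEL in the new app.py
        file=audio_file
    )
    return transcript.text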