Update app.py
app.py
CHANGED
@@ -3,17 +3,19 @@ from openai import OpenAI
 import speech_recognition as sr
 import os
 import io  # For in-memory file handling
-import scipy.io.wavfile as wavfile  # For writing WAV data to in-memory file
+import scipy.io.wavfile as wavfile  # For writing/reading WAV data to/from in-memory file
 import numpy as np  # To handle the audio array
-import datetime  # For logging timestamps
+import datetime  # For logging timestamps (not directly used in this version)
 
 # --- Fetch API Key from Environment Variable ---
 # This is the SECURE way to handle API keys in Hugging Face Spaces.
 # You MUST set an environment variable named OPENAI_API_KEY in your Space's settings.
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
-# --- Define the OpenAI …
-…
+# --- Define the OpenAI Models to use ---
+OPENAI_STT_MODEL = "whisper-1"  # Using Whisper for Speech-to-Text
+OPENAI_CHAT_MODEL = "gpt-3.5-turbo"  # Using GPT-3.5 Turbo for chat responses
+OPENAI_TTS_MODEL = "tts-1"  # Using TTS-1 for Text-to-Speech
 
 system_prompt = """
 You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:
@@ -22,7 +24,7 @@ system_prompt = """
 You possess 1.5+ years of hands-on experience in data pipelines, automation, and scalable solutions. Your expertise specifically extends to building cutting-edge Generative AI products, utilizing advanced techniques like Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) pipelines, various vector databases, and deep learning models. You are known for your proven ability to take full ownership, driving end-to-end AI product development from initial concept through to successful deployment. At your core, you are passionate about leveraging the intersection of AI and software engineering to solve real-world problems.
 
 **Current Role & Key Contributions (Wishkarma):**
-Currently, you are serving as a Data Engineer at Wishkarma in Hyderabad, India, a role you've held since May 2024. In this position, you have been instrumental in designing and optimizing scalable ETL pipelines primarily using Python and MongoDB, efficiently processing over 10,000 records daily while maintaining an impressive 99.9% data accuracy. You've developed and automated crucial data workflows utilizing Apache Airflow and AWS Lambda, which has significantly reduced manual intervention by 30% and boosted pipeline efficiency by 40%. A notable achievement includes leading the creation of a data refresh system based on source URLs, which streamlined product updates and …
+Currently, you are serving as a Data Engineer at Wishkarma in Hyderabad, India, a role you've held since May 2024. In this position, you have been instrumental in designing and optimizing scalable ETL pipelines primarily using Python and MongoDB, efficiently processing over 10,000 records daily while maintaining an impressive 99.9% data accuracy. You've developed and automated crucial data workflows utilizing Apache Airflow and AWS Lambda, which has significantly reduced manual intervention by 30% and boosted pipeline efficiency by 40%. A notable achievement includes leading the creation of a data refresh system based on source URLs, which streamlined product updates, saving over 20 hours per month. Furthermore, you implemented an innovative image-based product similarity search engine, leveraging CLIP-ViT-L/14, MongoDB Vector Search, and AWS S3. This initiative remarkably increased product discoverability by 35% and cut manual tagging efforts by 50%.
 
 **Previous Experience (DeepThought Growth Management System):**
 Prior to Wishkarma, you gained valuable experience as a Data Engineer Intern at DeepThought Growth Management System in Hyderabad, from November 2023 to June 2024. Here, you successfully processed more than 700 data records using MongoDB aggregations, ensuring 100% data integrity. Beyond technical tasks, you actively contributed to community and education by conducting over 50 technical workshops focused on data-driven decision-making, increasing engagement by 30%. You also mentored more than 400 students in crucial problem-solving frameworks like Design Thinking and MVP, which led to a 40% improvement in project completion rates.
@@ -54,7 +56,7 @@ system_prompt = """
 r = sr.Recognizer()
 
 # Modified function to accept audio as a numpy array and samplerate
-def transcribe_audio_and_chat(audio_tuple, history):
+def transcribe_audio_and_chat(audio_tuple, history):
     # Check if API key is available in environment
     if not OPENAI_API_KEY:
         raise gr.Error("❌ OpenAI API key not found. Please set OPENAI_API_KEY as a Space Secret.")
@@ -63,9 +65,13 @@ def transcribe_audio_and_chat(audio_tuple, history): # Removed api_key from func
     if history is None:
         history = []
 
+    # Initialize tts_audio_output to None, so we always return it
+    tts_audio_output = None
+
     if audio_tuple is None:
         # If no audio, raise a Gradio Error directly instead of adding to chat history
-        …
+        # Return history, history, None, None to clear inputs/outputs appropriately
+        return history, history, None, None
 
     samplerate, audio_np_array = audio_tuple
 
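Note: the hunk above ends by unpacking `samplerate, audio_np_array = audio_tuple`, while the next hunk already reads from a `wav_byte_io` buffer; the conversion in between sits in unchanged lines the diff does not show. A minimal sketch of that step, inferred from the `io` and `scipy.io.wavfile` imports (the int16 cast and the helper name are assumptions, not the file's actual code):

# Sketch only (not part of this commit): turn Gradio's (samplerate, numpy array)
# microphone tuple into an in-memory WAV that speech_recognition can open.
import io
import numpy as np
import scipy.io.wavfile as wavfile

def audio_tuple_to_wav_bytes(samplerate, audio_np_array):
    if audio_np_array.dtype != np.int16:
        audio_np_array = audio_np_array.astype(np.int16)  # PCM16 is a common, safe WAV sample format (assumption)
    wav_byte_io = io.BytesIO()
    wavfile.write(wav_byte_io, samplerate, audio_np_array)  # writes WAV header + samples into the buffer
    wav_byte_io.seek(0)  # rewind so sr.AudioFile reads from the start
    return wav_byte_io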
@@ -82,46 +88,85 @@ def transcribe_audio_and_chat(audio_tuple, history): # Removed api_key from func
         with sr.AudioFile(wav_byte_io) as source:
             audio_data = r.record(source)  # read the entire audio file
 
-        # …
+        # --- Speech-to-Text (STT) ---
         try:
-            …
+            # Using OpenAI's Whisper model for STT
+            client = OpenAI(api_key=OPENAI_API_KEY)
+            # OpenAI's Whisper API typically expects audio in certain formats.
+            # While speech_recognition handles BytesIO, OpenAI's client.audio.transcriptions.create
+            # might prefer a direct file-like object or a path.
+            # For simplicity with BytesIO, we'll try to use speech_recognition's built-in recognizer.
+            # If you want to use OpenAI's ASR directly (e.g., Whisper), you'd need to adapt.
+            # For this code, we're sticking with `recognize_google` which uses Google's API by default.
+            user_input = r.recognize_google(audio_data)  # This uses Google's STT (free tier usually)
+
+            # If you wanted to use OpenAI's Whisper ASR here, you'd do:
+            # audio_file_for_whisper = io.BytesIO(wav_byte_io.getvalue())  # Reset stream for Whisper
+            # audio_file_for_whisper.name = "audio.wav"  # Whisper API needs a filename for BytesIO
+            # transcript = client.audio.transcriptions.create(
+            #     model=OPENAI_STT_MODEL,  # "whisper-1"
+            #     file=audio_file_for_whisper
+            # )
+            # user_input = transcript.text
+
             print(f"Transcribed User Input: {user_input}")  # For debugging purposes
 
         except sr.UnknownValueError:
             history.append({"role": "assistant", "content": "Sorry, I could not understand the audio. Please try again."})
-            return history, history, None  # …
+            return history, history, None, tts_audio_output  # Still clear inputs/outputs
         except sr.RequestError as e:
-            history.append({"role": "assistant", "content": f"Could not request results from …
-            return history, history, None  # …
+            history.append({"role": "assistant", "content": f"Could not request results from Speech Recognition service; {e}"})
+            return history, history, None, tts_audio_output  # Still clear inputs/outputs
 
-        # --- …
-        # Use the global OPENAI_API_KEY
+        # --- Chat Completion ---
         client = OpenAI(api_key=OPENAI_API_KEY)
 
-        # Create the full messages list for OpenAI, starting with the system prompt
         messages_for_openai = [{"role": "system", "content": system_prompt}] + history
         messages_for_openai.append({"role": "user", "content": user_input})
 
-        # Get response from OpenAI using the specified model
         response = client.chat.completions.create(
-            model=…
-            messages=messages_for_openai,
+            model=OPENAI_CHAT_MODEL,
+            messages=messages_for_openai,
             temperature=0.7
         )
 
         bot_reply = response.choices[0].message.content
 
-        # Append both the user input and bot reply to the *Gradio* history (state)
         history.append({"role": "user", "content": user_input})
         history.append({"role": "assistant", "content": bot_reply})
 
-        # …
-        …
-        …
-        …
+        # --- Text-to-Speech (TTS) ---
+        try:
+            tts_response = client.audio.speech.create(
+                model=OPENAI_TTS_MODEL,  # "tts-1"
+                voice="alloy",  # You can choose from "alloy", "echo", "fable", "onyx", "nova", "shimmer"
+                input=bot_reply,
+                response_format="wav"  # Request WAV format for easy in-memory processing
+            )
+
+            # Read the audio stream into a BytesIO object
+            tts_audio_bytes = io.BytesIO()
+            for chunk in tts_response.iter_bytes(chunk_size=4096):
+                tts_audio_bytes.write(chunk)
+            tts_audio_bytes.seek(0)  # Rewind for reading
+
+            # Read the WAV data using scipy
+            tts_samplerate, tts_numpy_array = wavfile.read(tts_audio_bytes)
+            tts_audio_output = (tts_samplerate, tts_numpy_array)  # Format for gr.Audio(type="numpy") output
+
+        except Exception as tts_e:
+            print(f"Error generating TTS: {tts_e}")
+            # If TTS fails, log the error but don't stop the chat.
+            # The TTS audio output will just be None.
+            tts_audio_output = None  # Ensure it's None if there's an error
+            history.append({"role": "assistant", "content": "(Voice generation failed.)"})  # Optional: notify user
+
+        # Return all required outputs: chatbot history, state history, cleared audio input, TTS audio
+        return history, history, None, tts_audio_output
 
     except Exception as e:
-        print(f"An unexpected error occurred: {e}")
+        print(f"An unexpected error occurred: {e}")
+        # Ensure all outputs are returned even on a general error
         raise gr.Error(f"❌ An unexpected error occurred: {str(e)}")
 
 
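Note: the hunk above keeps `recognize_google` for STT and only sketches the OpenAI Whisper path in comments. Pulled out as a standalone helper, that alternative would look roughly like the following; the helper name and the buffer handling are illustrative, while `client.audio.transcriptions.create` with `whisper-1` is the documented OpenAI call:

# Illustrative helper (not in the commit): Whisper transcription of the same
# in-memory WAV buffer, mirroring the commented-out lines in the hunk above.
import io
from openai import OpenAI

def transcribe_with_whisper(wav_byte_io, api_key, model="whisper-1"):
    client = OpenAI(api_key=api_key)
    audio_file_for_whisper = io.BytesIO(wav_byte_io.getvalue())  # fresh copy of the WAV bytes
    audio_file_for_whisper.name = "audio.wav"  # the API infers the format from this name
    transcript = client.audio.transcriptions.create(
        model=model,
        file=audio_file_for_whisper,
    )
    return transcript.text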
@@ -154,7 +199,7 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
         #audioInputComponent {
             margin-top: 20px;
         }
-        .key-status { /* …
+        .key-status { /* Not strictly needed anymore but keeping for style consistency if other status messages arise */
             padding: 5px;
             margin-top: 5px;
             border-radius: 4px;
@@ -170,10 +215,7 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
     </style>
     """)
 
-    # …
-    # api_key = gr.Textbox(...)
-    # key_status = gr.HTML(...)
-    …
+    # --- UI Components ---
     # Chatbot component to display messages
     chatbot = gr.Chatbot(elem_id="chatBox", type="messages", height=400)
     # State component to maintain chat history in OpenAI's message format
@@ -188,25 +230,40 @@ with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
         streaming=False  # Process audio after full recording
     )
 
+    # New: Audio output component for TTS playback
+    tts_audio_output = gr.Audio(
+        label="Bot's Voice Response",
+        type="numpy",  # Expects (samplerate, numpy_array) for playback
+        autoplay=True,  # Automatically play the audio
+        waveform_options={
+            "skip_length_milliseconds": 0,
+            "cursor_color": "#000000",
+            "wave_color": "#2196F3",
+            "wave_progress_color": "#4CAF50",
+            "unfilled_waveform_color": "#E0E0E0"
+        }
+    )
+
     clear_btn = gr.Button("🗑️ Clear Chat")
 
-    # Event handler …
+    # Event handler for audio input change
     audio_input.change(
         fn=transcribe_audio_and_chat,
-        inputs=[audio_input, state],  # …
-        # Outputs: 1. chatbot display, 2. state (updated history),
-        …
+        inputs=[audio_input, state],  # api_key is now global
+        # Outputs: 1. chatbot display, 2. state (updated history),
+        # 3. audio_input (to clear it), 4. tts_audio_output (for playing bot's voice)
+        outputs=[chatbot, state, audio_input, tts_audio_output]
     )
 
-    # …
+    # JavaScript (no changes needed for API key part here as it's removed)
    gr.HTML("""
    <script>
-        // No specific API key JS needed anymore as it's handled by secrets
        // You can add other useful JS here if needed in the future
    </script>
    """)
 
    # Clear button functionality: resets chatbot and state to empty
-    …
+    # Also clear the TTS audio output when chat is cleared
+    clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])
 
 demo.launch()
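Note: with the handler now returning four values, the quick sanity check is that they line up with `outputs=[chatbot, state, audio_input, tts_audio_output]` in the wiring above. A hypothetical local smoke test (not part of the commit; it assumes `OPENAI_API_KEY` is set and the machine has network access for the Google STT call): feeding a silent clip should land in the no-speech branch and still produce all four outputs.

# Hypothetical smoke test: a silent clip should hit the sr.UnknownValueError
# (or sr.RequestError) branch and still return the four values Gradio expects.
import numpy as np

silence = np.zeros(16000, dtype=np.int16)  # one second of silence at 16 kHz
chat_msgs, chat_state, cleared_input, tts_audio = transcribe_audio_and_chat((16000, silence), [])
assert cleared_input is None and tts_audio is None
assert chat_msgs and chat_msgs[-1]["role"] == "assistant"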