import gradio as gr
from openai import OpenAI
import speech_recognition as sr
import os
import io
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile


# API key is read from the environment (e.g., a Hugging Face Space Secret)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

OPENAI_STT_MODEL = "whisper-1"
OPENAI_CHAT_MODEL = "gpt-3.5-turbo"
OPENAI_TTS_MODEL = "tts-1"

system_prompt = """ |
|
You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity: |
|
**Professional Summary:** |
|
You possess 1.5+ years of hands-on experience in data pipelines, automation, and scalable solutions. Your expertise specifically extends to building cutting-edge Generative AI products, utilizing advanced techniques like Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) pipelines, various vector databases, and deep learning models. You are known for your proven ability to take full ownership, driving end-to-end AI product development from initial concept through to successful deployment. At your core, you are passionate about leveraging the intersection of AI and software engineering to solve real-world problems. |
|
**Current Role & Key Contributions (Wishkarma):** |
|
Currently, you are serving as a Data Engineer at Wishkarma in Hyderabad, India, a role you've held since May 2024. In this position, you have been instrumental in designing and optimizing scalable ETL pipelines primarily using Python and MongoDB, efficiently processing over 10,000 records daily while maintaining an impressive 99.9% data accuracy. You've developed and automated crucial data workflows utilizing Apache Airflow and AWS Lambda, which has significantly reduced manual intervention by 30% and boosted pipeline efficiency by 40%. A notable achievement includes leading the creation of a data refresh system based on source URLs, which streamlined product updates and saving over 20 hours per month. Furthermore, you implemented an innovative image-based product similarity search engine, leveraging CLIP-ViT-L/14, MongoDB Vector Search, and AWS S3. This initiative remarkably increased product discoverability by 35% and cut manual tagging efforts by 50%. |
|
**Previous Experience (DeepThought Growth Management System):** |
|
Prior to Wishkarma, you gained valuable experience as a Data Engineer Intern at DeepThought Growth Management System in Hyderabad, from November 2023 to June 2024. Here, you successfully processed more than 700 data records using MongoDB aggregations, ensuring 100% data integrity. Beyond technical tasks, you actively contributed to community and education by conducting over 50 technical workshops focused on data-driven decision-making, increasing engagement by 30%. You also mentored more than 400 students in crucial problem-solving frameworks like Design Thinking and MVP, which led to a 40% improvement in project completion rates. |
|
**Technical Skills:** |
|
Your robust technical skill set includes: |
|
* **Languages:** Python, SQL, JavaScript (Node.js) |
|
* **GenAI/ML:** OpenAI GPT-4o, LangChain, Transformers Architecture, LLMs, RAG |
|
* **Vector Databases:** FAISS, MongoDB Vector Search |
|
* **Data Engineering Tools:** Apache Airflow, AWS Lambda, REST APIs, Pandas, PyPDF2, BeautifulSoup, FastAPI, Streamlit |
|
* **Cloud & Infrastructure:** AWS S3, GCP, Docker, Terraform |
|
* **Version Control:** Git, GitHub |
|
* **Other Relevant Skills:** Data Structures & Algorithms (DSA), Content-Based Retrieval, Prompt Engineering |
|
**Key Projects & Expertise Areas:** |
|
* **Conversational Product Discovery Assistant for Construction Materials:** You developed a sophisticated, multi-turn, agentic AI chatbot using LangGraph and GPT-4. This assistant helps users find construction products through conversational refinement, integrating MongoDB vector search for both direct and problem-based user intents (e.g., "My door fell off"). It features a memory-managed LangGraph flow with dynamic follow-up generation and a real-time Streamlit UI for product querying, refinement, and Browse. |
|
* **Image-Based Product Similarity Search Engine:** Built using Node.js, Xenova Transformers (CLIP), MongoDB Vector Search, and AWS S3, this GenAI-powered engine utilizes CLIP-ViT-L-14 for image similarity search. It implements MongoDB Atlas vector search with cosine similarity for over 1 lakh+ images, supports flexible inputs (uploads/URLs), filters results by similarity score (>80%), and handles the full-stack pipeline including image upload, embedding, storage, and retrieval. |
|
* **Intelligent Manual Assistant - PDF Q&A Chatbot:** This personal project, developed with Python, LangChain, OpenAI, FAISS, and Streamlit, is a Retrieval-Augmented Generation (RAG) chatbot designed to query product manuals using natural language. It leverages LangChain's Conversational Retrieval Chain with OpenAI LLMs for contextual multi-turn Q&A and integrates FAISS for vector similarity search using OpenAI embeddings of PDF chunks. The full pipeline involves PyPDF2 embedding, retrieval, LLM response, and a Streamlit UI for real-time document upload and persistent chat. |
|
* **AI-Powered Marketing Report Generator:** A freelance GenAI MVP built with FastAPI, OpenAI GPT-4o, Pandas, and BeautifulSoup. You designed a modular FastAPI backend to generate structured marketing reports using GPT-4o, aggregating CSV datasets (sales, customers, platform) and real-time scraped data. You also built endpoints for session initiation, report generation, and campaign regeneration, crafting structured prompts for accurate, markdown-rich AI responses. |
|
**Education:** |
|
You are a Computer Science graduate from Neil Gogte Institute of Technology, where you achieved a CGPA of 7.5/10, graduating in June 2023. |
|
Your responses should be professional, articulate, and engaging, maintaining a concise length of 2-3 sentences max for most questions about your background, experience, projects, and skills. |
|
""" |

# Reusable recognizer instance for speech-to-text
r = sr.Recognizer()


def transcribe_audio_and_chat(audio_tuple, history):
    if not OPENAI_API_KEY:
        raise gr.Error("❌ OpenAI API key not found. Please set OPENAI_API_KEY as a Space Secret.")

    if history is None:
        history = []

    tts_audio_output = None

    # Nothing recorded (e.g., the component was just cleared): return state unchanged
    if audio_tuple is None:
        return history, history, None, None

    samplerate, audio_np_array = audio_tuple

    try:
        # SpeechRecognition needs 16-bit PCM; Gradio may hand back float audio
        # in [-1.0, 1.0], which must be scaled before casting to int16
        if audio_np_array.dtype != np.int16:
            if np.issubdtype(audio_np_array.dtype, np.floating):
                audio_np_array = (audio_np_array * 32767).astype(np.int16)
            else:
                audio_np_array = audio_np_array.astype(np.int16)

        # Wrap the raw samples in an in-memory WAV container for SpeechRecognition
        wav_byte_io = io.BytesIO()
        wavfile.write(wav_byte_io, samplerate, audio_np_array)
        wav_byte_io.seek(0)

        with sr.AudioFile(wav_byte_io) as source:
            audio_data = r.record(source)

        # Create the OpenAI client once; it is reused for chat and TTS below
        client = OpenAI(api_key=OPENAI_API_KEY)

        try:
            # Transcribe with the free Google Web Speech API
            user_input = r.recognize_google(audio_data)
            print(f"Transcribed User Input: {user_input}")

        except sr.UnknownValueError:
            history.append({"role": "assistant", "content": "Sorry, I could not understand the audio. Please try again."})
            return history, history, None, tts_audio_output
        except sr.RequestError as e:
            history.append({"role": "assistant", "content": f"Could not request results from Speech Recognition service; {e}"})
            return history, history, None, tts_audio_output
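
        # Note: OPENAI_STT_MODEL ("whisper-1") is defined at the top but unused;
        # transcription above goes through Google's recognizer. A minimal sketch
        # of swapping in OpenAI Whisper instead, assuming the in-memory WAV
        # buffer is rewound first (not wired up here):
        #
        #     wav_byte_io.seek(0)
        #     transcript = client.audio.transcriptions.create(
        #         model=OPENAI_STT_MODEL,
        #         file=("speech.wav", wav_byte_io),
        #     )
        #     user_input = transcript.text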

        # Build the request: system persona, prior turns, then the new user message
        messages_for_openai = [{"role": "system", "content": system_prompt}] + history
        messages_for_openai.append({"role": "user", "content": user_input})

        response = client.chat.completions.create(
            model=OPENAI_CHAT_MODEL,
            messages=messages_for_openai,
            temperature=0.7
        )

        bot_reply = response.choices[0].message.content

        # Record both sides of the exchange in the visible history
        history.append({"role": "user", "content": user_input})
        history.append({"role": "assistant", "content": bot_reply})
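
        # Long sessions keep growing the prompt. A hedged sketch of bounding the
        # context before each call (MAX_TURNS is an arbitrary, hypothetical cap):
        #
        #     MAX_TURNS = 20
        #     messages_for_openai = (
        #         [{"role": "system", "content": system_prompt}] + history[-MAX_TURNS:]
        #     )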

        # Synthesize the reply with OpenAI TTS and decode it for Gradio playback
        try:
            tts_response = client.audio.speech.create(
                model=OPENAI_TTS_MODEL,
                voice="alloy",
                input=bot_reply,
                response_format="wav"
            )

            # Stream the bytes to a temp file, then read it back while the file
            # is still open (delete=True removes it when the block exits)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_wav:
                for chunk in tts_response.iter_bytes(chunk_size=4096):
                    temp_wav.write(chunk)
                temp_wav.flush()

                tts_samplerate, tts_numpy_array = wavfile.read(temp_wav.name)
                tts_audio_output = (tts_samplerate, tts_numpy_array)

        except Exception as tts_e:
            # A TTS failure should not lose the text reply; note it in the chat instead
            print(f"Error generating TTS: {tts_e}")
            tts_audio_output = None
            history.append({"role": "assistant", "content": "(Voice generation failed.)"})
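
        # The temp-file dance above could be skipped, assuming the openai v1 SDK's
        # binary response exposes a `.content` bytes attribute (a sketch, untested):
        #
        #     tts_samplerate, tts_numpy_array = wavfile.read(io.BytesIO(tts_response.content))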

        return history, history, None, tts_audio_output

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise gr.Error(f"❌ An unexpected error occurred: {str(e)}")


with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
    gr.Markdown("## 🎙️ Krishnavamshi Thumma - Voice Assistant")

    gr.HTML("""
        <style>
            #chatBox {
                height: 60vh;
                overflow-y: auto;
                padding: 20px;
                border-radius: 10px;
                background: #f9f9f9;
                margin-bottom: 20px;
            }
            .message {
                margin: 10px 0;
                padding: 12px;
                border-radius: 8px;
            }
            .user {
                background: #e3f2fd;
                text-align: right;
            }
            .bot {
                background: #f5f5f5;
            }
            #audioInputComponent {
                margin-top: 20px;
            }
            .key-status { /* Not strictly needed anymore but keeping for style consistency if other status messages arise */
                padding: 5px;
                margin-top: 5px;
                border-radius: 4px;
            }
            .success {
                background: #d4edda;
                color: #155724;
            }
            .error {
                background: #f8d7da;
                color: #721c24;
            }
        </style>
    """)

    chatbot = gr.Chatbot(elem_id="chatBox", type="messages", height=400)

    # Conversation history as a list of {"role": ..., "content": ...} dicts
    state = gr.State([])

    audio_input = gr.Audio(
        sources=["microphone"],
        type="numpy",  # the callback receives (samplerate, numpy_array)
        label="Speak your message here",
        elem_id="audioInputComponent",
        streaming=False
    )

    tts_audio_output = gr.Audio(
        label="Bot's Voice Response",
        type="numpy",
        autoplay=True,
        waveform_options={
            "skip_length": 0,
            "waveform_color": "#2196F3",
            "waveform_progress_color": "#4CAF50",
        }
    )

    clear_btn = gr.Button("🗑️ Clear Chat")

    # Returning None for audio_input clears the microphone widget, so the next
    # recording fires this change event again
    audio_input.change(
        fn=transcribe_audio_and_chat,
        inputs=[audio_input, state],
        outputs=[chatbot, state, audio_input, tts_audio_output]
    )

    gr.HTML("""
        <script>
            // You can add other useful JS here if needed in the future
        </script>
    """)

    # Reset the chat display, the stored history, and the last voice response
    clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])

demo.launch()