Spaces:

gmustafa413
/

ChatBot

Sleeping

App Files Files Community

ChatBot / app.py

gmustafa413

Update app.py

43c6974 verified 2 months ago

raw

history blame contribute delete

12.8 kB

	import faiss
	import numpy as np
	import gradio as gr
	import requests
	import json
	import re
	import torch
	from transformers import AutoTokenizer
	from langdetect import detect
	from sentence_transformers import SentenceTransformer
	from concurrent.futures import ThreadPoolExecutor
	from tqdm import tqdm

	# Configuration
	GROQ_API_KEY = "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" # Replace with your actual key
	MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
	DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json"
	CHUNK_SIZE = 512
	MAX_TOKENS = 4096
	WORKERS = 4
	EMBEDDING_BATCH_SIZE = 32

	# Load the embedding model
	model = SentenceTransformer(MODEL_NAME)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	class UniversityKnowledgeBase:
	def __init__(self):
	self.index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
	self.chunks = []
	self.loaded = False
	self.total_chunks = 0

	def load_dataset(self):
	"""Loads and thoroughly processes the University dataset"""
	try:
	print("\n" + "="*50)
	print("Loading University of Education, Lahore dataset...")
	print("="*50 + "\n")

	# Fetch dataset with error handling
	response = requests.get(DATASET_URL, timeout=30)
	if response.status_code != 200:
	raise Exception(f"Failed to fetch dataset. HTTP Status: {response.status_code}")

	# Parse JSON content
	try:
	data = response.json()
	except json.JSONDecodeError:
	raise Exception("Invalid JSON format in dataset")

	if not isinstance(data, list):
	raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.")

	# Process all content with progress tracking
	self.chunks = []
	with tqdm(data, desc="Processing dataset") as progress_bar:
	for item in progress_bar:
	if isinstance(item, dict):
	if 'question' in item and 'answer' in item:
	# Create comprehensive Q&A chunks
	self.chunks.append(f"QUESTION: {item['question'].strip()}\nANSWER: {item['answer'].strip()}\n")
	elif 'text' in item:
	# Process text content with semantic chunking
	text = item['text'].strip()
	if len(text) > CHUNK_SIZE:
	sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.\|\?)\s', text)
	current_chunk = ""
	for sentence in sentences:
	if len(current_chunk) + len(sentence) < CHUNK_SIZE:
	current_chunk += " " + sentence
	else:
	if current_chunk:
	self.chunks.append(current_chunk.strip())
	current_chunk = sentence
	if current_chunk:
	self.chunks.append(current_chunk.strip())
	else:
	self.chunks.append(text)

	self.total_chunks = len(self.chunks)
	if self.total_chunks == 0:
	raise Exception("No valid content found in the dataset")

	print(f"\nSuccessfully processed {self.total_chunks} knowledge chunks from dataset")

	# Generate embeddings in batches with progress tracking
	print("\nGenerating embeddings...")
	embeddings = []
	for i in tqdm(range(0, self.total_chunks, EMBEDDING_BATCH_SIZE),
	desc="Creating embeddings",
	total=(self.total_chunks//EMBEDDING_BATCH_SIZE)+1):
	batch = self.chunks[i:i+EMBEDDING_BATCH_SIZE]
	batch_embeddings = model.encode(
	batch,
	convert_to_tensor=True,
	show_progress_bar=False
	).cpu().numpy().astype('float32')
	embeddings.append(batch_embeddings)

	# Combine all embeddings and build FAISS index
	all_embeddings = np.concatenate(embeddings)
	self.index.add(all_embeddings)
	self.loaded = True

	return f"✅ Successfully loaded {self.total_chunks} knowledge chunks from University dataset"

	except Exception as e:
	import traceback
	traceback.print_exc()
	return f"❌ Error loading dataset: {str(e)}"

	def find_relevant_context(self, query, k=5):
	"""Finds the most relevant context with enhanced retrieval"""
	if not self.loaded or not self.chunks:
	return None

	try:
	# Generate query embedding
	query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')

	# Search with higher k initially for better context
	_, indices = self.index.search(query_embedding, k*2)

	# Get unique chunks (avoid duplicates)
	unique_indices = list(dict.fromkeys(indices[0]))

	# Select top-k most relevant unique chunks
	selected_chunks = []
	for idx in unique_indices[:k]:
	if 0 <= idx < len(self.chunks):
	selected_chunks.append(self.chunks[idx])

	return "\n\n---\n\n".join(selected_chunks) if selected_chunks else None
	except Exception as e:
	print(f"Context retrieval error: {str(e)}")
	return None

	# Initialize the knowledge base
	knowledge_base = UniversityKnowledgeBase()

	def detect_language(text):
	"""Enhanced language detection with Urdu support"""
	try:
	text = text.lower().strip()

	# Roman Urdu detection
	roman_urdu_keywords = ['hai', 'ho', 'hain', 'ka', 'ki', 'ke', 'main', 'tum', 'ap', 'kyun', 'kya']
	if any(keyword in text for keyword in roman_urdu_keywords):
	return "Roman Urdu"

	# Standard detection
	lang = detect(text)
	if lang == "ur":
	return "Urdu"
	elif lang == "hi": # Hindi/Urdu handling
	return "Urdu" if not text.isascii() else "Roman Urdu"
	return "English"
	except:
	return "English"

	def get_groq_response(context, user_query, language="English"):
	"""Generates accurate responses strictly based on context"""
	headers = {
	"Authorization": f"Bearer {GROQ_API_KEY}",
	"Content-Type": "application/json"
	}

	# Language-specific system prompts
	system_prompts = {
	"Urdu": """
	آپ یونیورسٹی آف ایجوکیشن، لاہور کا سرکاری چیٹ بوٹ ہیں۔ درج ذیل معلومات کی بنیاد پر درست جواب دیں۔
	اگر جواب دستیاب نہ ہو تو کہیں:
	"معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔"
	""",
	"Roman Urdu": """
	Aap University of Education, Lahore ka chatbot hain. Diye gaye context ke hisab se jawab dein.
	Agar jawab nahin mila to kehain:
	"Maazrat, yeh maloomat mojood nahin. University ki website check karein."
	""",
	"English": """
	You are the official chatbot of University of Education, Lahore.
	Answer STRICTLY based on the provided context. If the answer isn't available, say:
	"I'm sorry, this information isn't available. Please check the university website."
	"""
	}

	payload = {
	"model": "mixtral-8x7b-32768",
	"messages": [
	{"role": "system", "content": system_prompts.get(language, system_prompts["English"])},
	{"role": "user", "content": f"University Context:\n{context}\n\nQuestion: {user_query}"}
	],
	"temperature": 0.1, # Low temperature for factual accuracy
	"max_tokens": MAX_TOKENS,
	"top_p": 0.9
	}

	try:
	response = requests.post(
	"https://api.groq.com/openai/v1/chat/completions",
	headers=headers,
	json=payload,
	timeout=30
	)

	if response.status_code != 200:
	print(f"API Error {response.status_code}: {response.text[:200]}")
	return None

	return response.json().get("choices", [{}])[0].get("message", {}).get("content", "")
	except Exception as e:
	print(f"API Request Failed: {str(e)}")
	return None

	def chatbot_response(user_input, chat_history):
	"""Handles user queries with comprehensive response generation"""
	if not user_input.strip():
	return chat_history + [(user_input, "Please enter a valid question.")]

	# Detect language
	language = detect_language(user_input)

	# Retrieve relevant context (more chunks for better accuracy)
	context = knowledge_base.find_relevant_context(user_input, k=5)

	# Handle no context found
	if not context:
	error_messages = {
	"Urdu": "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔",
	"Roman Urdu": "Maazrat, yeh maloomat mojood nahin. University ki website check karein.",
	"English": "I'm sorry, this information isn't available. Please check the university website."
	}
	return chat_history + [(user_input, error_messages.get(language, error_messages["English"]))]

	# Generate response
	response = get_groq_response(context, user_input, language)

	# Fallback if API fails
	if not response:
	fallback_messages = {
	"Urdu": "معذرت، نظام میں عارضی خرابی ہے۔ بعد میں کوشش کریں۔",
	"Roman Urdu": "Maazrat, system mein masla hai. Baad mein koshish karein.",
	"English": "Sorry, there's a temporary system issue. Please try again later."
	}
	response = fallback_messages.get(language, fallback_messages["English"])

	return chat_history + [(user_input, response)]

	# Gradio Interface
	with gr.Blocks(title="University of Education ChatBot", theme=gr.themes.Soft()) as app:
	gr.Markdown("""
	<div style='text-align: center;'>
	<h1>University of Education, Lahore</h1>
	<h2>Official Information ChatBot</h2>
	<p>Ask any question about the university in English, Urdu, or Roman Urdu</p>
	</div>
	""")

	# Initialize dataset
	load_status = knowledge_base.load_dataset()

	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Knowledge Base Status")
	status = gr.Textbox(
	label="Dataset Status",
	value=load_status,
	interactive=False,
	lines=2
	)
	reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary")

	gr.Markdown("""
	Note: This chatbot answers strictly based on the official University of Education, Lahore dataset.
	""")

	with gr.Column(scale=2):
	chatbot = gr.Chatbot(
	height=500,
	label="Conversation History",
	bubble_full_width=False
	)
	question = gr.Textbox(
	label="Your Question",
	placeholder="Type your question about the university...",
	lines=2,
	max_lines=5
	)
	with gr.Row():
	ask_btn = gr.Button("Ask Question", variant="primary")
	clear_btn = gr.Button("Clear Conversation", variant="secondary")

	# Event handlers
	reload_btn.click(
	fn=lambda: knowledge_base.load_dataset(),
	inputs=None,
	outputs=status,
	queue=False
	)

	ask_btn.click(
	fn=chatbot_response,
	inputs=[question, chatbot],
	outputs=chatbot,
	queue=True
	).then(lambda: "", None, question)

	clear_btn.click(
	fn=lambda: [],
	inputs=None,
	outputs=chatbot,
	queue=False
	)

	# Launch the application
	if __name__ == "__main__":
	app.launch(server_name="0.0.0.0", server_port=7860)