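"""University of Education, Lahore FAQ chatbot.

Retrieval-augmented Gradio app: embeds an official Q&A dataset with
sentence-transformers, indexes it with FAISS, and answers questions in
English, Urdu, or Roman Urdu via the Groq chat completions API.
"""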
import os
import json
import re

import faiss
import numpy as np
import gradio as gr
import requests
from transformers import AutoTokenizer
from langdetect import detect
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
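# Assumed dependencies (the original file does not pin them); a typical install:
#   pip install faiss-cpu sentence-transformers transformers langdetect gradio requests tqdm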

# Configuration
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")  # Read from the environment; never commit API keys to source
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json"
CHUNK_SIZE = 512           # Maximum characters per knowledge chunk
MAX_TOKENS = 4096          # Completion budget for the Groq API
EMBEDDING_BATCH_SIZE = 32  # Chunks embedded per model.encode() call

# Load the embedding model
model = SentenceTransformer(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
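# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings; the FAISS
# index below takes its dimension from the model, so the two stay in sync.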

class UniversityKnowledgeBase:
    def __init__(self):
        # Flat L2 index performs exact nearest-neighbour search; adequate for a
        # dataset of a few thousand chunks.
        self.index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
        self.chunks = []
        self.loaded = False
        self.total_chunks = 0

    def load_dataset(self):
        """Loads and processes the University dataset into searchable chunks."""
        try:
            print("\n" + "=" * 50)
            print("Loading University of Education, Lahore dataset...")
            print("=" * 50 + "\n")

            # Fetch dataset with error handling
            response = requests.get(DATASET_URL, timeout=30)
            if response.status_code != 200:
                raise Exception(f"Failed to fetch dataset. HTTP status: {response.status_code}")

            # Parse JSON content
            try:
                data = response.json()
            except json.JSONDecodeError:
                raise Exception("Invalid JSON format in dataset")

            if not isinstance(data, list):
                raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.")

            # Process all content with progress tracking
            self.chunks = []
            with tqdm(data, desc="Processing dataset") as progress_bar:
                for item in progress_bar:
                    if isinstance(item, dict):
                        if 'question' in item and 'answer' in item:
                            # Keep each Q&A pair together as a single chunk
                            self.chunks.append(
                                f"QUESTION: {item['question'].strip()}\nANSWER: {item['answer'].strip()}\n"
                            )
                        elif 'text' in item:
                            # Split long free text on sentence boundaries so chunks
                            # stay under CHUNK_SIZE without cutting mid-sentence
                            text = item['text'].strip()
                            if len(text) > CHUNK_SIZE:
                                sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
                                current_chunk = ""
                                for sentence in sentences:
                                    if len(current_chunk) + len(sentence) < CHUNK_SIZE:
                                        current_chunk += " " + sentence
                                    else:
                                        if current_chunk:
                                            self.chunks.append(current_chunk.strip())
                                        current_chunk = sentence
                                if current_chunk:
                                    self.chunks.append(current_chunk.strip())
                            else:
                                self.chunks.append(text)

            self.total_chunks = len(self.chunks)
            if self.total_chunks == 0:
                raise Exception("No valid content found in the dataset")
            print(f"\nSuccessfully processed {self.total_chunks} knowledge chunks from dataset")

            # Generate embeddings in batches with progress tracking
            print("\nGenerating embeddings...")
            embeddings = []
            for i in tqdm(range(0, self.total_chunks, EMBEDDING_BATCH_SIZE),
                          desc="Creating embeddings",
                          total=(self.total_chunks // EMBEDDING_BATCH_SIZE) + 1):
                batch = self.chunks[i:i + EMBEDDING_BATCH_SIZE]
                batch_embeddings = model.encode(
                    batch,
                    convert_to_tensor=True,
                    show_progress_bar=False
                ).cpu().numpy().astype('float32')
                embeddings.append(batch_embeddings)

            # Combine all embeddings and build the FAISS index
            all_embeddings = np.concatenate(embeddings)
            self.index.add(all_embeddings)
            self.loaded = True
            return f"✅ Successfully loaded {self.total_chunks} knowledge chunks from University dataset"
        except Exception as e:
            import traceback
            traceback.print_exc()
            return f"❌ Error loading dataset: {str(e)}"

    def find_relevant_context(self, query, k=5):
        """Finds the k most relevant unique chunks for a query."""
        if not self.loaded or not self.chunks:
            return None
        try:
            # Embed the query with the same model used to build the index
            query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')
            # Over-fetch (2k) so duplicates can be dropped while still returning up to k chunks
            _, indices = self.index.search(query_embedding, k * 2)
            # Deduplicate while preserving rank order
            unique_indices = list(dict.fromkeys(indices[0]))
            selected_chunks = []
            for idx in unique_indices[:k]:
                # FAISS pads missing neighbours with -1; the bounds check filters those out
                if 0 <= idx < len(self.chunks):
                    selected_chunks.append(self.chunks[idx])
            return "\n\n---\n\n".join(selected_chunks) if selected_chunks else None
        except Exception as e:
            print(f"Context retrieval error: {str(e)}")
            return None
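
# Illustrative standalone usage (hypothetical query; names match this file):
#   kb = UniversityKnowledgeBase()
#   print(kb.load_dataset())
#   print(kb.find_relevant_context("What programs does the university offer?", k=3))
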
# Initialize the knowledge base
knowledge_base = UniversityKnowledgeBase()

def detect_language(text):
    """Detects whether user input is English, Urdu, or Roman Urdu."""
    try:
        text = text.lower().strip()
        # Roman Urdu detection via common function words; whole-word matching
        # avoids false positives such as "hai" inside "chair"
        roman_urdu_keywords = {'hai', 'ho', 'hain', 'ka', 'ki', 'ke', 'main', 'tum', 'ap', 'kyun', 'kya'}
        if roman_urdu_keywords & set(re.findall(r'[a-z]+', text)):
            return "Roman Urdu"
        # Standard detection
        lang = detect(text)
        if lang == "ur":
            return "Urdu"
        elif lang == "hi":  # langdetect often labels Urdu script as Hindi
            return "Urdu" if not text.isascii() else "Roman Urdu"
        return "English"
    except Exception:
        # langdetect raises on empty or ambiguous input; default to English
        return "English"

def get_groq_response(context, user_query, language="English"):
    """Generates accurate responses strictly based on context."""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    # Language-specific system prompts
    system_prompts = {
        "Urdu": """
آپ یونیورسٹی آف ایجوکیشن، لاہور کا سرکاری چیٹ بوٹ ہیں۔ درج ذیل معلومات کی بنیاد پر درست جواب دیں۔
اگر جواب دستیاب نہ ہو تو کہیں:
"معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔"
""",
        "Roman Urdu": """
Aap University of Education, Lahore ka chatbot hain. Diye gaye context ke hisab se jawab dein.
Agar jawab nahin mila to kehain:
"Maazrat, yeh maloomat mojood nahin. University ki website check karein."
""",
        "English": """
You are the official chatbot of University of Education, Lahore.
Answer STRICTLY based on the provided context. If the answer isn't available, say:
"I'm sorry, this information isn't available. Please check the university website."
"""
    }
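    # (The Urdu and Roman Urdu prompts above say, roughly: "You are the official
    # chatbot of the University of Education, Lahore. Answer accurately from the
    # given information; if the answer is unavailable, apologize and refer the
    # user to the university website.")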
    payload = {
        "model": "mixtral-8x7b-32768",
        "messages": [
            {"role": "system", "content": system_prompts.get(language, system_prompts["English"])},
            {"role": "user", "content": f"University Context:\n{context}\n\nQuestion: {user_query}"}
        ],
        "temperature": 0.1,  # Low temperature favours factual, deterministic answers
        "max_tokens": MAX_TOKENS,
        "top_p": 0.9
    }
    try:
        # Groq exposes an OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )
        if response.status_code != 200:
            print(f"API Error {response.status_code}: {response.text[:200]}")
            return None
        return response.json().get("choices", [{}])[0].get("message", {}).get("content", "")
    except Exception as e:
        print(f"API Request Failed: {str(e)}")
        return None
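
# Note: "mixtral-8x7b-32768" must be a model currently served by Groq; if it has
# been retired, any chat model from Groq's model list can be swapped in here
# without other code changes.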

def chatbot_response(user_input, chat_history):
    """Handles a user query end-to-end: language detection, retrieval, generation."""
    if not user_input.strip():
        return chat_history + [(user_input, "Please enter a valid question.")]

    # Detect language
    language = detect_language(user_input)

    # Retrieve relevant context (more chunks for better accuracy)
    context = knowledge_base.find_relevant_context(user_input, k=5)

    # Handle no context found
    if not context:
        error_messages = {
            "Urdu": "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔",
            "Roman Urdu": "Maazrat, yeh maloomat mojood nahin. University ki website check karein.",
            "English": "I'm sorry, this information isn't available. Please check the university website."
        }
        return chat_history + [(user_input, error_messages.get(language, error_messages["English"]))]

    # Generate response
    response = get_groq_response(context, user_input, language)

    # Fall back to a static message if the API call fails
    if not response:
        fallback_messages = {
            "Urdu": "معذرت، نظام میں عارضی خرابی ہے۔ بعد میں کوشش کریں۔",
            "Roman Urdu": "Maazrat, system mein masla hai. Baad mein koshish karein.",
            "English": "Sorry, there's a temporary system issue. Please try again later."
        }
        response = fallback_messages.get(language, fallback_messages["English"])

    return chat_history + [(user_input, response)]

# Gradio Interface
with gr.Blocks(title="University of Education ChatBot", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    <div style='text-align: center;'>
        <h1>University of Education, Lahore</h1>
        <h2>Official Information ChatBot</h2>
        <p>Ask any question about the university in English, Urdu, or Roman Urdu</p>
    </div>
    """)

    # Build the knowledge base once at startup
    load_status = knowledge_base.load_dataset()

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Knowledge Base Status")
            status = gr.Textbox(
                label="Dataset Status",
                value=load_status,
                interactive=False,
                lines=2
            )
            reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary")
            gr.Markdown("""
            **Note:** This chatbot answers strictly based on the official University of Education, Lahore dataset.
            """)
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                height=500,
                label="Conversation History",
                bubble_full_width=False
            )
            question = gr.Textbox(
                label="Your Question",
                placeholder="Type your question about the university...",
                lines=2,
                max_lines=5
            )
            with gr.Row():
                ask_btn = gr.Button("Ask Question", variant="primary")
                clear_btn = gr.Button("Clear Conversation", variant="secondary")

    # Event handlers
    reload_btn.click(
        fn=knowledge_base.load_dataset,
        inputs=None,
        outputs=status,
        queue=False
    )
    ask_btn.click(
        fn=chatbot_response,
        inputs=[question, chatbot],
        outputs=chatbot,
        queue=True
    ).then(lambda: "", None, question)  # Clear the input box after each question
    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=chatbot,
        queue=False
    )

# Launch the application
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
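
# 0.0.0.0 binds all network interfaces, and 7860 is the default port a
# Hugging Face Space expects a Gradio app to serve on.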