import os
import faiss
import numpy as np
import gradio as gr
import requests
import json
import re
import torch
from transformers import AutoTokenizer
from langdetect import detect
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Configuration
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # Supply your key via the environment; never hard-code it
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json"
CHUNK_SIZE = 512
MAX_TOKENS = 4096
WORKERS = 4
EMBEDDING_BATCH_SIZE = 32

# Load the embedding model and its tokenizer
model = SentenceTransformer(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class UniversityKnowledgeBase:
    def __init__(self):
        self.index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
        self.chunks = []
        self.loaded = False
        self.total_chunks = 0

    def load_dataset(self):
        """Loads and thoroughly processes the University dataset."""
        try:
            print("\n" + "=" * 50)
            print("Loading University of Education, Lahore dataset...")
            print("=" * 50 + "\n")

            # Fetch dataset with error handling
            response = requests.get(DATASET_URL, timeout=30)
            if response.status_code != 200:
                raise Exception(f"Failed to fetch dataset. HTTP Status: {response.status_code}")

            # Parse JSON content
            try:
                data = response.json()
            except json.JSONDecodeError:
                raise Exception("Invalid JSON format in dataset")

            if not isinstance(data, list):
                raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.")

            # Process all content with progress tracking
            self.chunks = []
            with tqdm(data, desc="Processing dataset") as progress_bar:
                for item in progress_bar:
                    if isinstance(item, dict):
                        if 'question' in item and 'answer' in item:
                            # Create comprehensive Q&A chunks
                            self.chunks.append(
                                f"QUESTION: {item['question'].strip()}\n"
                                f"ANSWER: {item['answer'].strip()}\n"
                            )
                        elif 'text' in item:
                            # Process text content with semantic chunking:
                            # split on sentence boundaries, then pack sentences
                            # into chunks of at most CHUNK_SIZE characters
                            text = item['text'].strip()
                            if len(text) > CHUNK_SIZE:
                                sentences = re.split(r'(?<=[.!?])\s+', text)
                                current_chunk = ""
                                for sentence in sentences:
                                    if len(current_chunk) + len(sentence) + 1 <= CHUNK_SIZE:
                                        current_chunk += sentence + " "
                                    else:
                                        if current_chunk.strip():
                                            self.chunks.append(current_chunk.strip())
                                        current_chunk = sentence + " "
                                if current_chunk.strip():
                                    self.chunks.append(current_chunk.strip())
                            else:
                                self.chunks.append(text)

            if not self.chunks:
                raise Exception("No usable content found in dataset")

            # Embed all chunks in batches and add them to the FAISS index
            embeddings = model.encode(
                self.chunks,
                batch_size=EMBEDDING_BATCH_SIZE,
                show_progress_bar=True,
                convert_to_numpy=True,
            )
            self.index.add(np.asarray(embeddings, dtype=np.float32))
            self.total_chunks = len(self.chunks)
            self.loaded = True
            return f"✅ Knowledge base ready: {self.total_chunks} chunks indexed."
        except Exception as e:
            self.loaded = False
            return f"❌ Error loading dataset: {e}"
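
    def search(self, query, k=5):
        """Return the k most relevant chunks for a query.

        A minimal sketch: plain L2 search over the flat FAISS index built in
        load_dataset(); the default k is an assumption.
        """
        if not self.loaded or not self.chunks:
            return []
        query_embedding = model.encode([query], convert_to_numpy=True)
        _, indices = self.index.search(np.asarray(query_embedding, dtype=np.float32), k)
        return [self.chunks[i] for i in indices[0] if 0 <= i < len(self.chunks)]


def query_groq(prompt):
    """Send a prompt to Groq's OpenAI-compatible chat-completions endpoint.

    A hedged sketch: the model id ("llama3-70b-8192") is an assumption;
    substitute whichever Groq model you have access to.
    """
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "llama3-70b-8192",  # assumed model id
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": MAX_TOKENS,
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


def chatbot_response(question, history):
    """Answer a question from retrieved context and return the updated history.

    The signature matches the Gradio wiring below (inputs=[question, chatbot],
    outputs=chatbot); the prompt framing and language hint are assumptions.
    """
    history = history or []
    if not question or not question.strip():
        return history
    context_chunks = knowledge_base.search(question)
    if not context_chunks:
        answer = "Sorry, I could not find an answer to that in the university dataset."
    else:
        try:
            language = detect(question)  # e.g. 'en' or 'ur'
        except Exception:
            language = "en"
        prompt = (
            "Answer strictly from the following University of Education, Lahore "
            f"context. Reply in the language of the question ({language}).\n\n"
            "CONTEXT:\n" + "\n".join(context_chunks) +
            f"\n\nQUESTION: {question}"
        )
        try:
            answer = query_groq(prompt)
        except Exception as e:
            answer = f"❌ Error contacting the language model: {e}"
    history.append((question, answer))
    return history


# Build the knowledge base and assemble the Gradio UI
knowledge_base = UniversityKnowledgeBase()

with gr.Blocks(title="University of Education, Lahore Chatbot") as app:
    gr.Markdown("""
# 🎓 University of Education, Lahore Chatbot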
Ask any question about the university in English, Urdu, or Roman Urdu
""") # Initialize dataset load_status = knowledge_base.load_dataset() with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Knowledge Base Status") status = gr.Textbox( label="Dataset Status", value=load_status, interactive=False, lines=2 ) reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary") gr.Markdown(""" **Note:** This chatbot answers strictly based on the official University of Education, Lahore dataset. """) with gr.Column(scale=2): chatbot = gr.Chatbot( height=500, label="Conversation History", bubble_full_width=False ) question = gr.Textbox( label="Your Question", placeholder="Type your question about the university...", lines=2, max_lines=5 ) with gr.Row(): ask_btn = gr.Button("Ask Question", variant="primary") clear_btn = gr.Button("Clear Conversation", variant="secondary") # Event handlers reload_btn.click( fn=lambda: knowledge_base.load_dataset(), inputs=None, outputs=status, queue=False ) ask_btn.click( fn=chatbot_response, inputs=[question, chatbot], outputs=chatbot, queue=True ).then(lambda: "", None, question) clear_btn.click( fn=lambda: [], inputs=None, outputs=chatbot, queue=False ) # Launch the application if __name__ == "__main__": app.launch(server_name="0.0.0.0", server_port=7860)