"""Question-answering backend for a University of Education Lahore assistant.

Incoming questions are matched against a local Q&A dataset using sentence
embeddings; the matched answer (or, failing a match, the bare question) is
then passed to a Groq-hosted LLM for phrasing. Questions without a good match
are additionally recorded in a Hugging Face dataset repository.
"""

import json
import os
import re
from datetime import datetime
from io import BytesIO
from typing import Optional, Tuple, Union

import numpy as np
import pandas as pd
import requests
from datasets import load_dataset, Dataset, DatasetDict
from dotenv import load_dotenv
from groq import Groq
from PIL import Image, ImageDraw, ImageFont
from sentence_transformers import SentenceTransformer, util

# Load environment variables
load_dotenv()

# Initialize Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load models and dataset
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Configuration
HF_DATASET_REPO = "midrees2806/unmatched_queries"  # Your dataset repo
HF_TOKEN = os.getenv("HF_TOKEN")  # From Space secrets

# Minimum cosine similarity for a dataset entry to count as a match.
SIMILARITY_THRESHOLD = 0.65

# Greeting words list
GREETINGS = [
    "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
    "assalam o alaikum", "salam", "namaste", "hola", "bonjour", "hi there",
    "hey there", "greetings", "howdy"
]

# Whole-word greeting matcher. Plain substring tests would fire on ordinary
# words that merely contain a greeting ("hi" is inside "this", "which" and
# "scholarship"), routing real questions to the greeting branch.
_GREETING_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(g) for g in sorted(GREETINGS, key=len, reverse=True))
    + r")\b"
)

# Phrases that send the user to the official fee-structure page.
FEE_KEYWORDS = ("fee structure", "fees structure", "semester fees", "semester fee")

# Markers the LLM may echo back from the prompt; text after them is the answer.
ANSWER_MARKERS = ("Improved Answer:", "Official Answer:")

# --- Dataset Loading ---
try:
    with open('dataset.json', 'r', encoding="utf-8") as f:
        dataset = json.load(f)
    if not all(isinstance(item, dict) and 'input' in item and 'response' in item for item in dataset):
        raise ValueError("Invalid dataset structure")
except Exception as e:
    # Best effort: keep the module importable with an empty dataset.
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute embeddings
dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
dataset_answers = [item.get("response", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)


# --- Unmatched Queries Handler ---
def manage_unmatched_queries(query: str):
    """Save an unmatched query to the HF dataset repo, skipping duplicates.

    Any failure is printed and swallowed so that logging a query can never
    break answering it.

    Args:
        query: The user's question text to record.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Load existing dataset or create new
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # NOTE(review): this also covers transient failures (e.g. network
            # errors), in which case the push below would replace the remote
            # data with a single row -- confirm whether that is acceptable.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        # Append new query (avoid duplicates)
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)

            # Push to Hub
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")


# --- Enhanced LLM Query ---
def query_groq_llm(prompt, model_name="llama3-70b-8192"):
    """Send ``prompt`` as a single user message to Groq and return the reply.

    Args:
        prompt: Text sent as the content of the user message.
        model_name: Groq model identifier to use.

    Returns:
        The stripped reply text, or ``""`` if the request fails for any
        reason (the error is printed).
    """
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": prompt
            }],
            model=model_name,
            temperature=0.7,
            max_tokens=500
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return ""


def handle_submit():
    """Read the input field, fetch an answer and display it.

    NOTE(review): ``input_field``, ``show_message``, ``scroll_to_answer`` and
    ``display_response`` are not defined in the visible part of this file;
    presumably they are provided by the frontend layer -- confirm.
    """
    user_input = input_field.value.strip()
    if not user_input:
        show_message("Please enter a question")
        return

    response = get_best_answer(user_input)

    # get_best_answer returns None, a plain string, or a dict depending on
    # the branch taken; calling .get() on the first two would raise.
    if response is None:
        return
    if isinstance(response, str):
        display_response(response)
        return

    if response.get('should_scroll', False):
        scroll_to_answer()
    display_response(response.get('response', ''))


def _is_greeting(text: str) -> bool:
    """Return True if ``text`` contains any GREETINGS entry as a whole word/phrase."""
    return _GREETING_PATTERN.search(text) is not None


def _find_best_match(text: str) -> Tuple[Optional[int], float]:
    """Return ``(index, score)`` of the dataset question most similar to ``text``.

    Returns ``(None, 0.0)`` when the dataset is empty (for instance because
    loading it failed), so callers take the "no match" path instead of
    failing on an empty similarity tensor.
    """
    if not dataset_answers:
        return None, 0.0
    user_embedding = similarity_model.encode(text, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
    best_match_idx = similarities.argmax().item()
    return best_match_idx, similarities[best_match_idx].item()


def _extract_answer(llm_response: str) -> str:
    """Return the text after the first known marker present in the reply.

    If the reply contains none of the markers it is returned unchanged.
    """
    for marker in ANSWER_MARKERS:
        if marker in llm_response:
            return llm_response.split(marker)[-1].strip()
    return llm_response


def get_best_answer(user_input) -> Union[dict, str, None]:
    """Produce an answer for ``user_input``.

    Args:
        user_input: Raw question text from the user.

    Returns:
        * ``None`` when the input is empty or whitespace only.
        * A plain ``str`` for the "too short" notice, greeting replies and
          the fee-structure pointer.
        * A ``dict`` with keys ``"response"`` (answer text) and
          ``"should_scroll"`` (always ``True``) for dataset/LLM answers.
    """
    # 1. Check for empty input
    if not user_input.strip():
        return None  # This will be handled in the frontend to prevent submission

    user_input_lower = user_input.lower().strip()
    is_greeting = _is_greeting(user_input_lower)

    # 2. Check for minimum word count (3 words)
    if len(user_input_lower.split()) < 3 and not is_greeting:
        return "Please ask your question properly with at least 3 words."

    # 3. Handle greetings (regardless of word count)
    if is_greeting:
        greeting_response = query_groq_llm(
            f"You are an official assistant for University of Education Lahore. "
            f"Respond to this greeting in a friendly and professional manner: {user_input}"
        )
        return greeting_response if greeting_response else "Hello! How can I assist you today?"

    # 4. Check if question is about fee
    if any(keyword in user_input_lower for keyword in FEE_KEYWORDS):
        return (
            "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
            "You'll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
            "🔗 https://ue.edu.pk/allfeestructure.php"
        )

    # 🔁 Continue with normal similarity-based logic
    best_match_idx, best_score = _find_best_match(user_input_lower)
    matched = best_match_idx is not None and best_score >= SIMILARITY_THRESHOLD

    if matched:
        original_answer = dataset_answers[best_match_idx]
        prompt = (
            "As an official assistant for University of Education Lahore, provide a clear response:\n"
            f"Question: {user_input}\n"
            f"Original Answer: {original_answer}\n"
            "Improved Answer:"
        )
    else:
        # Save unmatched queries (score below the threshold)
        manage_unmatched_queries(user_input)
        prompt = (
            "As an official assistant for University of Education Lahore, provide a helpful response:\n"
            "Include relevant details about university policies.\n"
            "If unsure, direct to official channels.\n"
            f"Question: {user_input}\n"
            "Official Answer:"
        )

    llm_response = query_groq_llm(prompt)

    if llm_response:
        response = _extract_answer(llm_response)
    elif matched:
        # LLM unavailable: fall back to the stored dataset answer.
        response = dataset_answers[best_match_idx]
    else:
        response = (
            "For official information:\n"
            "📞 +92-42-99262231-33\n"
            "✉️ info@ue.edu.pk\n"
            "🌐 ue.edu.pk"
        )

    # Return the response along with a flag to indicate auto-scrolling should happen
    return {
        "response": response,
        "should_scroll": True  # Frontend should use this to trigger auto-scrolling
    }