"""Question-answering backend for a University of Education Lahore assistant.

On import this module loads environment variables, creates the Groq client,
loads the sentence-embedding model and the local ``dataset.json`` Q&A pairs,
and precomputes the question embeddings. ``get_best_answer`` is the entry
point intended to be called from the Gradio UI.
"""
import json
import os
import re
from datetime import datetime

import pandas as pd
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from groq import Groq
from sentence_transformers import SentenceTransformer, util

# Load environment variables
load_dotenv()

# Initialize Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load similarity model
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Config
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Minimum cosine similarity for a dataset question to count as a match.
SIMILARITY_THRESHOLD = 0.65

# Greeting list
GREETINGS = [
    "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
    "assalam o alaikum", "salam", "aoa", "hi there", "hey there", "greetings"
]

# Whole-word matcher for GREETINGS. A plain substring test would treat the
# "hi" inside "this" or "which" as a greeting; the word boundaries prevent
# that. Longer phrases are listed first so they win over their prefixes.
_GREETING_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(g) for g in sorted(GREETINGS, key=len, reverse=True))
    + r")\b"
)

# Keywords that short-circuit to the static fee-structure answer.
_FEE_KEYWORDS = ("fee structure", "fees structure", "semester fees", "semester fee")

# Load local dataset
try:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open('dataset.json', 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    if not all(
        isinstance(item, dict) and 'Question' in item and 'Answer' in item
        for item in dataset
    ):
        raise ValueError("Invalid dataset structure")
except Exception as e:
    # Best effort: the assistant keeps running without local answers.
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute embeddings
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
# With no questions there is nothing to embed; None marks "no local dataset"
# so get_best_answer can skip the similarity search instead of crashing.
dataset_embeddings = (
    similarity_model.encode(dataset_questions, convert_to_tensor=True)
    if dataset_questions
    else None
)


# Save unmatched queries to Hugging Face
def manage_unmatched_queries(query: str) -> None:
    """Record *query* in the Hugging Face dataset of unmatched queries.

    The existing ``train`` split of ``HF_DATASET_REPO`` is loaded, the query
    is appended (with a timestamp and ``Processed=False``) if it is not
    already present, and the result is pushed back to the hub. Any failure is
    printed and swallowed so that logging never breaks answering.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # NOTE(review): every load failure starts from an empty frame, and
            # the push below appears to upload only this frame. Confirm that a
            # transient load error cannot overwrite previously stored records.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")


# Query Groq LLM
def query_groq_llm(prompt, model_name="llama3-70b-8192") -> str:
    """Send *prompt* as a single user message to Groq and return the reply.

    Returns the stripped completion text, or an empty string if the request
    fails for any reason (the error is printed).
    """
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": prompt
            }],
            model=model_name,
            temperature=0.7,
            max_tokens=500
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return ""


def _contains_greeting(text_lower: str) -> bool:
    """Return True if the lower-cased text contains a greeting as whole words."""
    return _GREETING_PATTERN.search(text_lower) is not None


def _find_best_match(query_lower: str):
    """Return ``(index, score)`` of the most similar dataset question.

    Returns ``(None, 0.0)`` when no local dataset was loaded.
    """
    if dataset_embeddings is None:
        return None, 0.0
    user_embedding = similarity_model.encode(query_lower, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
    best_match_idx = similarities.argmax().item()
    return best_match_idx, similarities[best_match_idx].item()


# Main logic function to be called from Gradio
def get_best_answer(user_input) -> str:
    """Return the assistant's reply to *user_input*.

    Order of handling:

    1. Empty, whitespace-only or ``None`` input gets a prompt to enter a question.
    2. Inputs shorter than three words that are not greetings are rejected.
    3. Greetings are answered by the LLM (with a canned fallback).
    4. Fee-structure questions get a static answer with the official link.
    5. Otherwise the closest dataset question is looked up. A match at or
       above ``SIMILARITY_THRESHOLD`` is rephrased by the LLM; anything else
       is logged as unmatched and answered by the LLM alone.
    6. If the LLM returns nothing, the raw dataset answer (when matched) or
       the official contact details are returned.
    """
    if not user_input or not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()
    is_greeting = _contains_greeting(user_input_lower)

    if len(user_input_lower.split()) < 3 and not is_greeting:
        return "Please ask your question properly with at least 3 words."

    if is_greeting:
        greeting_response = query_groq_llm(
            f"You are an official assistant for University of Education Lahore. "
            f"Respond to this greeting in a friendly and professional manner: {user_input}"
        )
        return greeting_response if greeting_response else "Hello! How can I assist you today?"

    if any(keyword in user_input_lower for keyword in _FEE_KEYWORDS):
        return (
            "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
            "You'll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
            "🔗 https://ue.edu.pk/allfeestructure.php"
        )

    best_match_idx, best_score = _find_best_match(user_input_lower)
    matched = best_match_idx is not None and best_score >= SIMILARITY_THRESHOLD

    if matched:
        original_answer = dataset_answers[best_match_idx]
        prompt = (
            "As an official assistant for University of Education Lahore, provide a clear response:\n"
            f"Question: {user_input}\n"
            f"Original Answer: {original_answer}\n"
            "Improved Answer:"
        )
    else:
        # Keep a record of questions the local dataset could not answer.
        manage_unmatched_queries(user_input)
        prompt = (
            "As an official assistant for University of Education Lahore, provide a helpful response:\n"
            "Include relevant details about university policies.\n"
            "If unsure, direct to official channels.\n"
            f"Question: {user_input}\n"
            "Official Answer:"
        )

    llm_response = query_groq_llm(prompt)

    if llm_response:
        # The model sometimes echoes the prompt's trailing label; keep only
        # the text after the last occurrence of the first marker found.
        for marker in ("Improved Answer:", "Official Answer:", "Rephrased Answer:"):
            if marker in llm_response:
                return llm_response.split(marker)[-1].strip()
        return llm_response

    if matched:
        return dataset_answers[best_match_idx]
    return (
        "For official information:\n"
        "📞 +92-42-99262231-33\n"
        "✉️ info@ue.edu.pk\n"
        "🌐 https://ue.edu.pk"
    )