"""Question-answering helper for a University of Education Lahore assistant.

A user question is compared against the questions in ``dataset.json`` using
sentence embeddings.  The closest stored answer (if similar enough) is handed
to a Groq-hosted LLM for rephrasing; questions without a close match are
answered by the LLM alone and recorded in a Hugging Face dataset repository.
"""

import json
import os
from datetime import datetime
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from groq import Groq
from PIL import Image, ImageDraw, ImageFont
from sentence_transformers import SentenceTransformer, util

# NOTE(review): several imports above (requests, BytesIO, PIL, numpy) are not
# used by the code in this module as seen here; they are kept because other
# code may rely on them being imported.

# Populate the process environment from a .env file so the os.getenv calls
# below can see GROQ_API_KEY and HF_TOKEN.
load_dotenv()

# Client used for all LLM completions.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Sentence-embedding model used for question similarity.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Hugging Face repository that collects questions with no good dataset match.
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Minimum cosine similarity for a dataset entry to count as a match.
_SIMILARITY_THRESHOLD = 0.65

# Substrings that route a question straight to the fee-structure page.
_FEE_KEYWORDS = ("fee structure", "fees structure")

# Prompt suffixes the LLM sometimes echoes back; text before them is dropped.
_ANSWER_MARKERS = ("Improved Answer:", "Official Answer:")

# Explicit encoding: the platform default is not guaranteed to be UTF-8 and
# the answers may contain non-ASCII text.
with open('dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Embed every stored question once at import time so each request only has to
# embed the incoming question.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)


def manage_unmatched_queries(query: str):
    """Record *query* in the Hugging Face dataset of unmatched questions.

    The current ``train`` split of ``HF_DATASET_REPO`` is loaded, the query is
    appended with a timestamp and ``Processed=False`` if it is not already
    present, and the result is pushed back to the hub.

    This is best-effort: any failure is printed and swallowed so that a
    logging problem never prevents the user from getting an answer.

    Args:
        query: The user's question exactly as entered.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # Was a bare ``except:``, which also trapped KeyboardInterrupt and
            # SystemExit.
            # NOTE(review): any load failure falls back to an empty frame, so
            # a later successful push would replace the remote data with only
            # the new entry -- confirm this is acceptable.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")


def query_groq_llm(prompt, model_name="llama3-70b-8192"):
    """Send *prompt* to the Groq chat API and return the reply text.

    Args:
        prompt: Text sent as a single user message.
        model_name: Groq model identifier to use for the completion.

    Returns:
        The stripped content of the first completion choice, or an empty
        string if the request fails for any reason (the error is printed).
    """
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": prompt
            }],
            model=model_name,
            temperature=0.7,
            max_tokens=500
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return ""


def _strip_answer_marker(llm_response):
    """Return the text after the first known answer marker found, if any."""
    for marker in _ANSWER_MARKERS:
        if marker in llm_response:
            return llm_response.split(marker)[-1].strip()
    return llm_response


def get_best_answer(user_input):
    """Answer a user question using the dataset and the LLM.

    Steps:
      1. Reject empty input and questions shorter than three words.
      2. Short-circuit fee-structure questions with a fixed message.
      3. Find the most similar stored question by cosine similarity.
      4. If the similarity is below the threshold, record the question as
         unmatched and ask the LLM for a general answer; otherwise ask the
         LLM to improve the stored answer.
      5. If the LLM returns nothing, fall back to the stored answer (when
         matched) or to a fixed contact-information message.

    Args:
        user_input: The question text. ``None`` is treated like empty input.

    Returns:
        The response string to show to the user.
    """
    # ``None`` previously raised AttributeError on ``.strip()``.
    if not user_input or not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()

    if len(user_input_lower.split()) < 3:
        return "Please ask your question properly with at least 3 words."

    # Fee questions always get the official link rather than an LLM answer.
    if any(keyword in user_input_lower for keyword in _FEE_KEYWORDS):
        return (
            "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
            "You’ll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
            "🔗 https://ue.edu.pk/allfeestructure.php"
        )

    # Similarity search against the precomputed question embeddings.
    user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
    best_match_idx = similarities.argmax().item()
    best_score = similarities[best_match_idx].item()

    matched = best_score >= _SIMILARITY_THRESHOLD

    # NOTE(review): the prompt text below is kept byte-for-byte as found, on a
    # single line; if line breaks between the sections were intended, restore
    # them deliberately.
    if matched:
        original_answer = dataset_answers[best_match_idx]
        prompt = (
            "As an official assistant for University of Education Lahore, "
            "provide a clear response: "
            f"Question: {user_input} "
            f"Original Answer: {original_answer} "
            "Improved Answer:"
        )
    else:
        # Keep a record of questions the dataset could not answer.
        manage_unmatched_queries(user_input)
        prompt = (
            "As an official assistant for University of Education Lahore, "
            "provide a helpful response: "
            "Include relevant details about university policies. "
            "If unsure, direct to official channels. "
            f"Question: {user_input} "
            "Official Answer:"
        )

    llm_response = query_groq_llm(prompt)
    if llm_response:
        return _strip_answer_marker(llm_response)

    # The LLM was unavailable or returned nothing.
    if matched:
        return dataset_answers[best_match_idx]
    return "For official information: 📞 +92-42-99262231-33 ✉️ info@ue.edu.pk 🌐 ue.edu.pk"