import json
import os
from datetime import datetime
from typing import Optional

import pandas as pd
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from groq import Groq
from sentence_transformers import SentenceTransformer, util

# Load environment variables
load_dotenv()

# Initialize clients
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Configuration
HF_DATASET_REPO = "midrees2806/unmatched_queries"  # Your dataset repo
HF_TOKEN = os.getenv("HF_TOKEN")  # From Space secrets
SIMILARITY_THRESHOLD = 0.65  # Minimum cosine similarity for a dataset match

# --- Dataset Loading ---
try:
    with open('dataset.json', 'r') as f:
        dataset = json.load(f)
    if not all(isinstance(item, dict) and 'input' in item and 'response' in item
               for item in dataset):
        raise ValueError("Invalid dataset structure")
except Exception as e:
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute embeddings for every question in the local dataset
dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
dataset_answers = [item.get("response", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)


# --- Unmatched Queries Handler ---
def manage_unmatched_queries(query: str) -> None:
    """Save an unmatched query to the HF Dataset, with error handling."""
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Load the existing dataset, or start a fresh one
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
        # Append the new query (skip duplicates), then push to the Hub
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")


# --- Enhanced LLM Query ---
def query_llm(prompt: str, model: str = "llama3-70b-8192") -> Optional[str]:
    """Query the Groq chat API; return None on failure so callers can fall back."""
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.7,
            max_tokens=1024,
            top_p=0.9,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"LLM Error: {e}")
        return None


# --- Main Chat Function ---
def get_best_answer(user_input: str) -> str:
    user_input = user_input.strip()
    lower_input = user_input.lower()

    # 1. Handle special cases
    if any(kw in lower_input for kw in ["fee", "fees", "tuition"]):
        return ("💰 Fee information:\n"
                "Please visit: https://ue.edu.pk/allfeestructure.php\n"
                "For personalized help, contact accounts@ue.edu.pk")

    # 2. Semantic similarity search (guard against an empty dataset)
    if len(dataset_questions) == 0:
        best_idx, best_score = -1, 0.0
    else:
        query_embedding = similarity_model.encode(lower_input, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_embedding, dataset_embeddings)[0]
        best_idx = scores.argmax().item()
        best_score = scores[best_idx].item()

    # 3. Log unmatched queries for later review
    if best_score < SIMILARITY_THRESHOLD:
        manage_unmatched_queries(user_input)

    # 4. Generate a response
    if best_score >= SIMILARITY_THRESHOLD:
        context = dataset_answers[best_idx]
        prompt = f"""University Assistant Task:
Question: {user_input}
Context: {context}
Generate a helpful, accurate response using the context.
If unsure, say "Please contact info@ue.edu.pk" """
    else:
        prompt = f"""As an official University of Education assistant, answer:
Question: {user_input}
Guidelines:
- Be polite and professional
- Direct to official channels if uncertain
- Keep responses under 3 sentences"""

    response = query_llm(prompt)
    return response or ("For official assistance:\n"
                        "📞 +92-42-99262231-33\n"
                        "✉️ info@ue.edu.pk")
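

# --- Usage sketch ---
# A minimal smoke test, assuming this file is run directly with GROQ_API_KEY
# (and optionally HF_TOKEN) set in the environment. The sample questions are
# hypothetical and were not part of the original script.
if __name__ == "__main__":
    for question in ["What are the tuition fees?", "How do I apply for admission?"]:
        # The first question hits the hard-coded fee shortcut; the second goes
        # through the similarity search and LLM fallback path.
        print(f"Q: {question}")
        print(f"A: {get_best_answer(question)}\n")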