Spaces:

midrees2806
/

Chatbot

Running

App Files Files Community

Chatbot / rag.py

midrees2806

Update rag.py

2cb70a0 verified 25 days ago

raw

history blame

6.58 kB

	import json
	from sentence_transformers import SentenceTransformer, util
	from groq import Groq
	from datetime import datetime
	import requests
	from io import BytesIO
	from PIL import Image, ImageDraw, ImageFont
	import numpy as np
	from dotenv import load_dotenv
	import os
	from datasets import load_dataset, Dataset, DatasetDict
	import pandas as pd

	# Load environment variables
	# This runs before the os.getenv() lookups below, so variables supplied
	# through a .env file are available to them.
	load_dotenv()

	# Initialize Groq client
	# The API key is read from the GROQ_API_KEY environment variable.
	groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

	# Load models and dataset
	# Sentence-embedding model used to encode both the stored questions and
	# incoming user questions for similarity matching.
	similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

	# Configuration
	# Hub dataset repo that manage_unmatched_queries() loads from and pushes to,
	# and the access token passed to those calls.
	HF_DATASET_REPO = "midrees2806/unmatched_queries" # Your dataset repo
	HF_TOKEN = os.getenv("HF_TOKEN") # From Space secrets

	# Greeting words list
	# Lower-case phrases that get_best_answer() looks for in the lower-cased
	# user input to decide whether a message is a greeting.
	GREETINGS = [
	"hi", "hello", "hey", "good morning", "good afternoon", "good evening",
	"assalam o alaikum", "salam", "namaste", "hola", "bonjour", "hi there",
	"hey there", "greetings", "howdy"
	]

	# --- Dataset Loading ---
	# Read the question/answer pairs from dataset.json. The file must hold a JSON
	# list of objects, each with a string 'input' and a 'response' key. On any
	# failure (missing file, malformed JSON, wrong structure) the error is printed
	# and the module falls back to an empty dataset instead of failing at import.
	try:
	    # Decode explicitly as UTF-8 so loading does not depend on the platform's
	    # locale default encoding.
	    with open('dataset.json', 'r', encoding='utf-8') as f:
	        dataset = json.load(f)
	    # 'input' must be a string because it is lower-cased below, outside
	    # this try block.
	    if not isinstance(dataset, list) or not all(
	        isinstance(item, dict)
	        and isinstance(item.get('input'), str)
	        and 'response' in item
	        for item in dataset
	    ):
	        raise ValueError("Invalid dataset structure")
	except Exception as e:
	    print(f"Error loading dataset: {e}")
	    dataset = []

	# Precompute embeddings
	# Questions are normalised (lower-cased, stripped) before encoding; user
	# input is normalised the same way before it is compared against them.
	dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
	dataset_answers = [item.get("response", "") for item in dataset]
	dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)

	# --- Unmatched Queries Handler ---
	def manage_unmatched_queries(query: str) -> None:
	    """Record an unmatched user query in the Hugging Face dataset repo.

	    Loads the "train" split of HF_DATASET_REPO into a DataFrame, appends a row
	    for *query* (columns "Query", "Timestamp", "Processed") unless the same
	    query text is already present, and pushes the updated table back to the
	    Hub. This is best-effort: every failure is printed and swallowed so that
	    logging can never break the caller.

	    Args:
	        query: The user's question exactly as it should be stored.
	    """
	    try:
	        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	        # Load existing dataset or create new
	        try:
	            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
	            df = ds["train"].to_pandas()
	        except Exception as load_error:
	            # Catch Exception rather than using a bare except so that
	            # KeyboardInterrupt/SystemExit still propagate.
	            # NOTE(review): any load failure (including a transient network
	            # error) starts from an empty table, and the push below would
	            # then replace the remote data with a single row -- confirm this
	            # is acceptable or narrow the exception to "repo not found".
	            print(f"Could not load existing queries, starting a new table: {load_error}")
	            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

	        # Append new query (avoid duplicates)
	        if query in df["Query"].values:
	            # Nothing changed, so skip the upload.
	            return

	        new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
	        df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)

	        # Push to Hub
	        updated_ds = Dataset.from_pandas(df)
	        updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
	    except Exception as e:
	        print(f"Failed to save query: {e}")

	# --- Enhanced LLM Query ---
	def query_groq_llm(prompt, model_name="llama3-70b-8192"):
	    """Send *prompt* to the Groq chat-completions API as a single user message.

	    Args:
	        prompt: Text placed in the "content" of the one user message.
	        model_name: Model identifier passed to the API.

	    Returns:
	        The stripped text of the first returned choice, or "" if anything
	        goes wrong (the error is printed, never raised).
	    """
	    user_message = {"role": "user", "content": prompt}
	    try:
	        completion = groq_client.chat.completions.create(
	            messages=[user_message],
	            model=model_name,
	            temperature=0.7,
	            max_tokens=500,
	        )
	        first_choice = completion.choices[0]
	        return first_choice.message.content.strip()
	    except Exception as exc:
	        # Callers treat the empty string as "no LLM answer available".
	        print(f"Error querying Groq API: {exc}")
	        return ""

	def handle_submit():
	    """Read the question from the input field, answer it and show the result.

	    Blank input is rejected with a prompt message. Otherwise the text is
	    passed to get_best_answer(), whose result may be None, a plain string, or
	    a dict with "response" and "should_scroll" keys; all three shapes are
	    handled here.
	    """
	    user_input = input_field.value.strip()

	    if not user_input:
	        show_message("Please enter a question")
	        return

	    response = get_best_answer(user_input)

	    # get_best_answer() returns None when it considers the input empty.
	    if response is None:
	        show_message("Please enter a question")
	        return

	    # Plain-string replies carry no scroll flag, so they are displayed
	    # directly. Calling .get() on them would raise AttributeError.
	    if not isinstance(response, dict):
	        display_response(str(response))
	        return

	    if response.get('should_scroll', False):
	        scroll_to_answer()

	    display_response(response.get('response', ''))

	def get_best_answer(user_input):
	    """Produce the chatbot's reply to *user_input*.

	    Processing order:
	      1. Empty, blank or None input returns None.
	      2. Fewer than three words and no greeting returns a fixed hint string.
	      3. Input containing a greeting phrase returns an LLM-written greeting
	         (or a fixed fallback greeting if the LLM call yields nothing).
	      4. Fee-structure questions return a fixed pointer to the official page.
	      5. Everything else is matched against the precomputed dataset
	         embeddings. With a cosine similarity of at least 0.65 the stored
	         answer is handed to the LLM to rephrase; below that the query is
	         recorded via manage_unmatched_queries() and the LLM answers freely.

	    Args:
	        user_input: Raw text typed by the user.

	    Returns:
	        None for case 1; a str for cases 2-4; for case 5 a dict
	        {"response": str, "should_scroll": True}. The mixed return types are
	        kept as-is for existing callers.
	    """
	    # 1. Check for empty input (None is treated like an empty string)
	    if not user_input or not user_input.strip():
	        return None  # This will be handled in the frontend to prevent submission

	    user_input_lower = user_input.lower().strip()
	    # Whole-word match: a plain substring test would see "hi" inside
	    # "which" or "this" and misclassify ordinary questions as greetings.
	    is_greeting = _contains_greeting(user_input_lower)

	    # 2. Check for minimum word count (3 words)
	    if len(user_input_lower.split()) < 3 and not is_greeting:
	        return "Please ask your question properly with at least 3 words."

	    # 3. Handle greetings (regardless of word count)
	    if is_greeting:
	        greeting_response = query_groq_llm(
	            f"You are an official assistant for University of Education Lahore. "
	            f"Respond to this greeting in a friendly and professional manner: {user_input}"
	        )
	        return greeting_response if greeting_response else "Hello! How can I assist you today?"

	    # 4. Check if question is about fee
	    fee_keywords = ("fee structure", "fees structure", "semester fees", "semester fee")
	    if any(keyword in user_input_lower for keyword in fee_keywords):
	        return (
	            "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
	            "You'll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
	            "🔗 https://ue.edu.pk/allfeestructure.php"
	        )

	    # 🔁 Continue with normal similarity-based logic
	    threshold = 0.65
	    if dataset_answers:
	        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
	        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
	        best_match_idx = similarities.argmax().item()
	        best_score = similarities[best_match_idx].item()
	    else:
	        # The dataset failed to load or is empty, so there is nothing to
	        # compare against; treat every query as unmatched rather than
	        # crashing on an empty similarity tensor.
	        best_match_idx = None
	        best_score = 0.0

	    has_match = best_score >= threshold

	    if has_match:
	        original_answer = dataset_answers[best_match_idx]
	        prompt = f"""As an official assistant for University of Education Lahore, provide a clear response:
	Question: {user_input}
	Original Answer: {original_answer}
	Improved Answer:"""
	    else:
	        # Save unmatched queries (threshold = 0.65)
	        manage_unmatched_queries(user_input)
	        prompt = f"""As an official assistant for University of Education Lahore, provide a helpful response:
	Include relevant details about university policies.
	If unsure, direct to official channels.
	Question: {user_input}
	Official Answer:"""

	    llm_response = query_groq_llm(prompt)

	    if llm_response:
	        # If the model echoed one of the prompt's trailing markers, keep
	        # only the text after its last occurrence.
	        response = llm_response
	        for marker in ["Improved Answer:", "Official Answer:"]:
	            if marker in llm_response:
	                response = llm_response.split(marker)[-1].strip()
	                break
	    elif has_match:
	        # No LLM answer: fall back to the stored answer verbatim.
	        response = dataset_answers[best_match_idx]
	    else:
	        # NOTE(review): the e-mail line below reads "[email protected]",
	        # which looks like a redaction placeholder rather than a real
	        # address -- confirm and restore the intended address.
	        response = """For official information:
	📞 +92-42-99262231-33
	✉️ [email protected]
	🌐 ue.edu.pk"""

	    # Return the response along with a flag to indicate auto-scrolling should happen
	    return {
	        "response": response,
	        "should_scroll": True  # Frontend should use this to trigger auto-scrolling
	    }


	def _contains_greeting(text):
	    """Return True if *text* contains any GREETINGS phrase as whole words."""
	    # Turn punctuation into spaces and collapse whitespace so that "hi!",
	    # "hello," and "assalam-o-alaikum" still match their phrases.
	    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
	    padded = " " + " ".join(normalized.split()) + " "
	    return any(f" {greet} " in padded for greet in GREETINGS)