Spaces:

midrees2806
/

Practice

Sleeping

App Files Files Community

midrees2806 committed on Apr 30

Commit

ade1780

verified ·

1 Parent(s): baf9f53

Update rag.py

Browse files

Files changed (1) hide show

rag.py +47 -40

rag.py CHANGED Viewed

@@ -1,9 +1,12 @@
 import json
 from sentence_transformers import SentenceTransformer, util
 from groq import Groq
-import os
-import csv
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
@@ -11,21 +14,34 @@ load_dotenv()
 # Initialize Groq client
 groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-# Load similarity model
 similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
 # Load dataset
-with open('dataset.json', 'r', encoding='utf-8') as f:
-    dataset = json.load(f)
 # Precompute embeddings
 dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
 dataset_answers = [item.get("response", "") for item in dataset]
 dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
-# Use absolute path for unmatched_queries.csv
-base_dir = os.path.dirname(os.path.abspath(__file__))
-file_path = os.path.join(base_dir, "unmatched_queries.csv")
 def query_groq_llm(prompt, model_name="llama3-70b-8192"):
     try:
@@ -40,48 +56,39 @@ def query_groq_llm(prompt, model_name="llama3-70b-8192"):
         )
         return chat_completion.choices[0].message.content.strip()
     except Exception as e:
-        print(f"[ERROR] Groq API: {e}")
-        return ""
-def log_unmatched_query(query):
     try:
-        # Create file with header if not exists
-        if not os.path.exists(file_path):
-            with open(file_path, mode="w", newline="", encoding="utf-8") as file:
-                writer = csv.writer(file)
-                writer.writerow(["Unmatched Queries"])
-        # Append unmatched query
-        with open(file_path, mode="a", newline="", encoding="utf-8") as file:
-            writer = csv.writer(file)
-            writer.writerow([query])
-        print(f"[DEBUG] Logged unmatched query: {query}")
     except Exception as e:
-        print(f"[ERROR] Logging unmatched query failed: {e}")
 def get_best_answer(user_input):
     user_input_lower = user_input.lower().strip()
-    # 🧾 Fee-specific shortcut
     if any(keyword in user_input_lower for keyword in ["fee", "fees", "charges", "semester fee"]):
         return (
             "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
-            "You’ll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
             "🔗 https://ue.edu.pk/allfeestructure.php"
         )
-    # 🔍 Similarity matching
     user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
     similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
     best_match_idx = similarities.argmax().item()
     best_score = similarities[best_match_idx].item()
-    # ✏️ Log unmatched queries
     if best_score < 0.65:
-        log_unmatched_query(user_input)
-    # 🧠 Prompt for LLM
     if best_score >= 0.65:
         original_answer = dataset_answers[best_match_idx]
         prompt = f"""As an official assistant for University of Education Lahore, provide a clear response:
@@ -95,19 +102,19 @@ def get_best_answer(user_input):
         Question: {user_input}
         Official Answer:"""
-    # 🔗 Query Groq LLM
     llm_response = query_groq_llm(prompt)
-    # ✂️ Process LLM output
     if llm_response:
         for marker in ["Improved Answer:", "Official Answer:"]:
             if marker in llm_response:
-                return llm_response.split(marker)[-1].strip()
-        return llm_response
     else:
-        return dataset_answers[best_match_idx] if best_score >= 0.65 else (
-            "For official information:\n"
-            "📞 +92-42-99262231-33\n"
-            "✉️ info@ue.edu.pk\n"
-            "🌐 ue.edu.pk"
-        )

 import json
+import csv
+from pathlib import Path
+from datetime import datetime
 from sentence_transformers import SentenceTransformer, util
 from groq import Groq
 from dotenv import load_dotenv
+import os
+import pandas as pd
 # Load environment variables
 load_dotenv()
 # Initialize Groq client
 groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Load models and dataset
 similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
 # Load dataset
+try:
+    with open('dataset.json', 'r') as f:
+        dataset = json.load(f)
+    # Validate dataset structure
+    if not all(isinstance(item, dict) and 'input' in item and 'response' in item for item in dataset):
+        raise ValueError("Invalid dataset structure")
+except (json.JSONDecodeError, ValueError, FileNotFoundError) as e:
+    print(f"Error loading dataset: {e}")
+    dataset = []
 # Precompute embeddings
 dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
 dataset_answers = [item.get("response", "") for item in dataset]
 dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
+# Initialize unmatched queries CSV
+def init_unmatched_queries_file():
+    csv_file = Path('unmatched_queries.csv')
+    if not csv_file.exists():
+        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow(['Unmatched Queries', 'Timestamp'])
+init_unmatched_queries_file()
 def query_groq_llm(prompt, model_name="llama3-70b-8192"):
     try:
         )
         return chat_completion.choices[0].message.content.strip()
     except Exception as e:
+        print(f"Error querying Groq API: {e}")
+        return None
+def save_unmatched_query(query):
     try:
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        with open('unmatched_queries.csv', 'a', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+            writer.writerow([query, timestamp])
     except Exception as e:
+        print(f"Error saving unmatched query: {e}")
 def get_best_answer(user_input):
     user_input_lower = user_input.lower().strip()
+    # Handle fee-related questions
     if any(keyword in user_input_lower for keyword in ["fee", "fees", "charges", "semester fee"]):
         return (
             "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
+            "You'll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
             "🔗 https://ue.edu.pk/allfeestructure.php"
         )
+    # Similarity matching
     user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
     similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
     best_match_idx = similarities.argmax().item()
     best_score = similarities[best_match_idx].item()
+    # Save unmatched queries
     if best_score < 0.65:
+        save_unmatched_query(user_input)
     if best_score >= 0.65:
         original_answer = dataset_answers[best_match_idx]
         prompt = f"""As an official assistant for University of Education Lahore, provide a clear response:
         Question: {user_input}
         Official Answer:"""
     llm_response = query_groq_llm(prompt)
     if llm_response:
         for marker in ["Improved Answer:", "Official Answer:"]:
             if marker in llm_response:
+                response = llm_response.split(marker)[-1].strip()
+                break
+        else:
+            response = llm_response
     else:
+        response = dataset_answers[best_match_idx] if best_score >= 0.65 else """For official information:
+        📞 +92-42-99262231-33
+        ✉️ info@ue.edu.pk
+        🌐 ue.edu.pk"""
+    return response