midrees2806 committed on
Commit
a9f7e8d
·
verified ·
1 Parent(s): da4b0c7

Update rag.py

Browse files
Files changed (1) hide show
  1. rag.py +24 -81
rag.py CHANGED
@@ -7,8 +7,6 @@ from io import BytesIO
7
  from PIL import Image, ImageDraw, ImageFont
8
  import numpy as np
9
  from dotenv import load_dotenv
10
- from datasets import load_dataset, Dataset
11
- import pandas as pd
12
  import os
13
 
14
  # Load environment variables
@@ -20,16 +18,7 @@ groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
20
  # Load models and dataset
21
  similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
22
 
23
- # Constants
24
- HF_DATASET_REPO = "midrees2806/unmatched_queries"
25
- HF_TOKEN = os.getenv("HF_TOKEN")
26
- GREETINGS = [
27
- "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
28
- "assalam o alaikum", "salam", "aoa", "hi there",
29
- "hey there", "greetings"
30
- ]
31
-
32
- # Load dataset
33
  with open('dataset.json', 'r') as f:
34
  dataset = json.load(f)
35
 
@@ -38,24 +27,6 @@ dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
38
  dataset_answers = [item.get("response", "") for item in dataset]
39
  dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
40
 
41
- # Save unmatched queries to Hugging Face
42
- def manage_unmatched_queries(query: str):
43
- try:
44
- timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
45
- try:
46
- ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
47
- df = ds["train"].to_pandas()
48
- except:
49
- df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
50
- if query not in df["Query"].values:
51
- new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
52
- df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
53
- updated_ds = Dataset.from_pandas(df)
54
- updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
55
- except Exception as e:
56
- print(f"Failed to save query: {e}")
57
-
58
- # Query Groq LLM
59
  def query_groq_llm(prompt, model_name="llama3-70b-8192"):
60
  try:
61
  chat_completion = groq_client.chat.completions.create(
@@ -72,77 +43,49 @@ def query_groq_llm(prompt, model_name="llama3-70b-8192"):
72
  print(f"Error querying Groq API: {e}")
73
  return ""
74
 
75
- # Main logic function
76
  def get_best_answer(user_input):
77
- if not user_input.strip():
78
- return "Please enter a valid question."
79
-
80
  user_input_lower = user_input.lower().strip()
81
 
82
- # 🟡 Greet back if user greets
83
- if any(greet in user_input_lower for greet in GREETINGS):
84
- greeting_response = query_groq_llm(
85
- f"You are an official assistant for University of Education Lahore. "
86
- f"Respond to this greeting in a friendly and professional manner: {user_input}"
87
- )
88
- return greeting_response if greeting_response else "Hello! How can I assist you today?"
89
-
90
- # 💰 Fee-specific shortcut
91
- if any(keyword in user_input_lower for keyword in ["semester fee", "semester fees"]):
92
  return (
93
  "πŸ’° For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
94
  "You’ll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
95
  "πŸ”— https://ue.edu.pk/allfeestructure.php"
96
  )
97
 
98
- # πŸ” Similarity-based matching
99
  user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
100
  similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
101
  best_match_idx = similarities.argmax().item()
102
  best_score = similarities[best_match_idx].item()
103
 
104
- if best_score < 0.65:
105
- manage_unmatched_queries(user_input)
106
-
107
- # 🧠 Use original dataset answer if matched well
108
  if best_score >= 0.65:
109
  original_answer = dataset_answers[best_match_idx]
110
- prompt = f"""Name is UOE AI Assistant! You are an official assistant for the University of Education Lahore.
111
-
112
- Rephrase the following official answer clearly and professionally.
113
- Use structured formatting (like headings, bullet points, or numbered lists) where appropriate.
114
- DO NOT add any new or extra information. ONLY rephrase and improve the clarity and formatting of the original answer.
115
-
116
- ### Question:
117
- {user_input}
118
-
119
- ### Original Answer:
120
- {original_answer}
121
-
122
- ### Rephrased Answer:
123
- """
124
  else:
125
- prompt = f"""Name is UOE AI Assistant! As an official assistant for University of Education Lahore, provide a helpful response:
126
- Include relevant details about university policies.
127
- If unsure, direct to official channels.
128
-
129
- ### Question:
130
- {user_input}
131
-
132
- ### Official Answer:
133
- """
134
 
135
  llm_response = query_groq_llm(prompt)
136
 
137
  if llm_response:
138
- for marker in ["Improved Answer:", "Official Answer:", "Rephrased Answer:"]:
139
  if marker in llm_response:
140
- return llm_response.split(marker)[-1].strip()
141
- return llm_response
 
 
142
  else:
143
- return dataset_answers[best_match_idx] if best_score >= 0.65 else (
144
- "For official information:\n"
145
- "πŸ“ž +92-42-99262231-33\n"
146
- "βœ‰οΈ info@ue.edu.pk\n"
147
- "🌐 https://ue.edu.pk"
148
- )
 
7
  from PIL import Image, ImageDraw, ImageFont
8
  import numpy as np
9
  from dotenv import load_dotenv
 
 
10
  import os
11
 
12
  # Load environment variables
 
18
  # Load models and dataset
19
  similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
20
 
21
+ # Load dataset (automatically using the path)
 
 
 
 
 
 
 
 
 
22
  with open('dataset.json', 'r') as f:
23
  dataset = json.load(f)
24
 
 
27
  dataset_answers = [item.get("response", "") for item in dataset]
28
  dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def query_groq_llm(prompt, model_name="llama3-70b-8192"):
31
  try:
32
  chat_completion = groq_client.chat.completions.create(
 
43
  print(f"Error querying Groq API: {e}")
44
  return ""
45
 
 
46
  def get_best_answer(user_input):
 
 
 
47
  user_input_lower = user_input.lower().strip()
48
 
49
+ # 👉 Check if question is about fee
50
+ if any(keyword in user_input_lower for keyword in ["semester fee","semester fees"]):
 
 
 
 
 
 
 
 
51
  return (
52
  "πŸ’° For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
53
  "You’ll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
54
  "πŸ”— https://ue.edu.pk/allfeestructure.php"
55
  )
56
 
57
+ # πŸ” Continue with normal similarity-based logic
58
  user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
59
  similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
60
  best_match_idx = similarities.argmax().item()
61
  best_score = similarities[best_match_idx].item()
62
 
 
 
 
 
63
  if best_score >= 0.65:
64
  original_answer = dataset_answers[best_match_idx]
65
+ prompt = f"""As an official assistant for University of Education Lahore, provide a clear response:
66
+ Question: {user_input}
67
+ Original Answer: {original_answer}
68
+ Improved Answer:"""
 
 
 
 
 
 
 
 
 
 
69
  else:
70
+ prompt = f"""As an official assistant for University of Education Lahore, provide a helpful response:
71
+ Include relevant details about university policies.
72
+ If unsure, direct to official channels.
73
+ Question: {user_input}
74
+ Official Answer:"""
 
 
 
 
75
 
76
  llm_response = query_groq_llm(prompt)
77
 
78
  if llm_response:
79
+ for marker in ["Improved Answer:", "Official Answer:"]:
80
  if marker in llm_response:
81
+ response = llm_response.split(marker)[-1].strip()
82
+ break
83
+ else:
84
+ response = llm_response
85
  else:
86
+ response = dataset_answers[best_match_idx] if best_score >= 0.65 else """For official information:
87
+ πŸ“ž +92-42-99262231-33
88
+ βœ‰οΈ [email protected]
89
+ 🌐 ue.edu.pk"""
90
+
91
+ return response