Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -4,86 +4,73 @@ import requests
|
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
from dotenv import load_dotenv
|
6 |
|
7 |
-
|
8 |
-
# β
Load .env file
|
9 |
load_dotenv()
|
10 |
|
11 |
-
#
|
12 |
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
13 |
-
|
14 |
if not HF_API_TOKEN:
|
15 |
-
raise ValueError("
|
16 |
-
|
17 |
-
# β
Hugging Face GPT Model API Endpoint
|
18 |
-
GPT_MODEL_API = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
|
19 |
|
20 |
-
#
|
|
|
21 |
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
|
22 |
|
23 |
-
#
|
24 |
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
25 |
|
26 |
-
#
|
27 |
-
with open("
|
28 |
dataset = json.load(f)
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
dataset_answers = [item.get("Answer", "") for item in dataset]
|
33 |
-
dataset_embeddings = similarity_model.encode(
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
# β
Find the best match and its similarity score
|
49 |
-
best_match_index = similarities.argmax().item()
|
50 |
-
best_score = similarities[best_match_index].item()
|
51 |
-
|
52 |
-
# β
Set a similarity threshold (tune as needed)
|
53 |
-
THRESHOLD = 0.65
|
54 |
-
|
55 |
-
if best_score < THRESHOLD:
|
56 |
-
return (
|
57 |
-
"I'm sorry, I couldn't find an exact answer to your question. "
|
58 |
-
"You may kindly try rephrasing your question gently for better results. "
|
59 |
-
"Also, feel free to visit the UOE official website for information: https://ue.edu.pk/"
|
60 |
-
)
|
61 |
-
|
62 |
-
best_answer = dataset_answers[best_match_index]
|
63 |
-
return rephrase_answer(best_answer)
|
64 |
-
|
65 |
-
# β
Function to rephrase answer using GPT
|
66 |
-
def rephrase_answer(answer):
|
67 |
-
prompt = (
|
68 |
-
f"Rephrase the following university-related answer while keeping the meaning unchanged:\n\n"
|
69 |
-
f"Original Answer: {answer}\n\n"
|
70 |
-
f"Rephrased Answer:"
|
71 |
-
)
|
72 |
-
|
73 |
-
payload = {"inputs": prompt} # β
Structured prompt to prevent extra details
|
74 |
-
|
75 |
-
response = requests.post(GPT_MODEL_API, headers=headers, json=payload)
|
76 |
-
|
77 |
-
if response.status_code == 200:
|
78 |
result = response.json()
|
79 |
-
if isinstance(result, list) and result
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
if "Rephrased Answer:" in generated_text:
|
84 |
-
return generated_text.split("Rephrased Answer:")[-1].strip()
|
85 |
-
return generated_text
|
86 |
-
|
87 |
-
return answer # β
Return original answer if API fails
|
88 |
-
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv

# Load environment variables from the local .env file
load_dotenv()

# API configurations — fail fast if the Hugging Face token is absent,
# since every downstream API call would 401 without it.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("Hugging Face API Token is missing from .env file")

# DeepSeek API endpoint (Hugging Face hosted inference)
DEEPSEEK_API = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-llm-7b"
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

# Load models — sentence embeddings for dataset-question similarity search
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load dataset — explicit encoding so JSON parsing does not depend on the
# platform's default locale encoding.
with open("dataset.json", encoding="utf-8") as f:
    dataset = json.load(f)

# Precompute embeddings once at startup; questions are normalized
# (lowercased/stripped) to match how user input is normalized at query time.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
def query_deepseek(prompt):
    """Send *prompt* to the DeepSeek inference endpoint and return the generated text.

    Args:
        prompt: Full prompt string to pass to the model.

    Returns:
        The stripped ``generated_text`` from the API response, or an empty
        string on any failure (network error, non-2xx status, or an
        unexpected response shape) — callers treat "" as "no answer".
    """
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 500,
            "temperature": 0.7,
            "do_sample": True
        }
    }
    try:
        # timeout is required: requests has no default timeout, so a stalled
        # inference endpoint would otherwise hang this call indefinitely.
        response = requests.post(DEEPSEEK_API, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        return result[0].get("generated_text", "").strip() if isinstance(result, list) and result else ""
    except Exception as e:
        # Best-effort: log and degrade to "" so the caller can fall back
        # to the raw dataset answer instead of crashing the chat flow.
        print(f"DeepSeek API error: {e}")
        return ""
|
|
|
|
|
|
|
|
|
|
|
|
def get_best_answer(user_input, threshold=0.65):
    """Answer *user_input* from the closest dataset Q&A entry, refined by DeepSeek.

    Encodes the (normalized) user question, finds the most similar dataset
    question by cosine similarity, and either asks DeepSeek to rephrase the
    stored answer (similarity >= *threshold*) or to answer free-form
    (no good match). Falls back to the raw dataset answer — or a generic
    apology pointing at the UOE website — when the API returns nothing.

    Args:
        user_input: The user's question.
        threshold: Minimum cosine similarity to treat a dataset entry as a
            match. Previously a magic constant duplicated in two places;
            default 0.65 preserves the original behavior.

    Returns:
        A response string (never empty).
    """
    # Find best match from dataset (same normalization as the precomputed
    # dataset_questions: lowercase + strip).
    user_embedding = similarity_model.encode(user_input.lower().strip(), convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
    best_match_idx = similarities.argmax().item()
    best_score = similarities[best_match_idx].item()

    if best_score >= threshold:  # Good match found
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""Rephrase this university answer to be more helpful while keeping key information:
Question: {user_input}
Original Answer: {original_answer}
Improved Answer:"""
    else:  # No good match
        prompt = f"""As a university assistant, provide a helpful response to:
Question: {user_input}
Answer:"""

    deepseek_response = query_deepseek(prompt)

    if deepseek_response:
        # The model often echoes the prompt; keep only the text after the
        # last answer marker when one is present.
        for marker in ["Improved Answer:", "Answer:"]:
            if marker in deepseek_response:
                return deepseek_response.split(marker)[-1].strip()
        return deepseek_response

    # API failed/returned nothing: fall back to the stored answer when we
    # had a good match, otherwise apologize and point at the official site.
    return dataset_answers[best_match_idx] if best_score >= threshold else """I couldn't find specific information.
Please visit the UOE website: https://ue.edu.pk/"""