midrees2806 committed
Commit 47214f6 · verified · 1 Parent(s): 462de64

Update utils.py

Files changed (1):
  1. utils.py +56 -69
utils.py CHANGED
@@ -4,86 +4,73 @@ import requests
 from sentence_transformers import SentenceTransformer, util
 from dotenv import load_dotenv

-
-# ✅ Load .env file
+# Load environment variables
 load_dotenv()

-# ✅ Hugging Face API Token check
+# API configurations
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
-
 if not HF_API_TOKEN:
-    raise ValueError("Error: Hugging Face API Token is missing! Please check your .env file.")
-
-# ✅ Hugging Face GPT Model API Endpoint
-GPT_MODEL_API = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+    raise ValueError("Hugging Face API Token is missing from .env file")

-# ✅ Headers for API request
+# DeepSeek API endpoint
+DEEPSEEK_API = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-llm-7b"
 headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

-# ✅ Load sentence transformer model for intent matching
+# Load models
 similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

-# ✅ Load dataset
-with open("ds.json") as f:
+# Load dataset
+with open("dataset.json") as f:
     dataset = json.load(f)

-# ✅ Precompute embeddings for dataset questions
-dataset_inputs = [item.get("Question", "").lower().strip() for item in dataset]
+# Precompute embeddings
+dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
 dataset_answers = [item.get("Answer", "") for item in dataset]
-dataset_embeddings = similarity_model.encode(dataset_inputs, convert_to_tensor=True)
-
-# ✅ Function to detect greetings
-def is_greeting(text):
-    greetings = ["hello", "hi", "hey", "salam", "assalam o alaikum", "assalamu alaikum", "assalamualaikum"]
-    return text.lower().strip() in greetings
-
-# ✅ Function to get the best matching answer
-def get_best_answer(user_input):
-    if is_greeting(user_input):
-        return "Hello! 😊 How can I assist you today with university-related information?"
-
-    user_input_embedding = similarity_model.encode(user_input.lower().strip(), convert_to_tensor=True)
-    similarities = util.pytorch_cos_sim(user_input_embedding, dataset_embeddings)[0]
-
-    # ✅ Find the best match and its similarity score
-    best_match_index = similarities.argmax().item()
-    best_score = similarities[best_match_index].item()
-
-    # ✅ Set a similarity threshold (tune as needed)
-    THRESHOLD = 0.65
-
-    if best_score < THRESHOLD:
-        return (
-            "I'm sorry, I couldn't find an exact answer to your question. "
-            "You may kindly try rephrasing your question gently for better results. "
-            "Also, feel free to visit the UOE official website for information: https://ue.edu.pk/"
-        )
-
-    best_answer = dataset_answers[best_match_index]
-    return rephrase_answer(best_answer)
-
-# ✅ Function to rephrase answer using GPT
-def rephrase_answer(answer):
-    prompt = (
-        f"Rephrase the following university-related answer while keeping the meaning unchanged:\n\n"
-        f"Original Answer: {answer}\n\n"
-        f"Rephrased Answer:"
-    )
-
-    payload = {"inputs": prompt}  # ✅ Structured prompt to prevent extra details
-
-    response = requests.post(GPT_MODEL_API, headers=headers, json=payload)
-
-    if response.status_code == 200:
+dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
+
+def query_deepseek(prompt):
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": 500,
+            "temperature": 0.7,
+            "do_sample": True
+        }
+    }
+    try:
+        response = requests.post(DEEPSEEK_API, headers=headers, json=payload)
+        response.raise_for_status()
         result = response.json()
-        if isinstance(result, list) and result:
-            generated_text = result[0].get("generated_text", answer).strip()
-
-            # ✅ Ensure only the rephrased answer is returned
-            if "Rephrased Answer:" in generated_text:
-                return generated_text.split("Rephrased Answer:")[-1].strip()
-            return generated_text
-
-    return answer  # ✅ Return original answer if API fails
-
+        return result[0].get("generated_text", "").strip() if isinstance(result, list) and result else ""
+    except Exception as e:
+        print(f"DeepSeek API error: {e}")
+        return ""

+def get_best_answer(user_input):
+    # Find best match from dataset
+    user_embedding = similarity_model.encode(user_input.lower().strip(), convert_to_tensor=True)
+    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
+    best_match_idx = similarities.argmax().item()
+    best_score = similarities[best_match_idx].item()
+
+    if best_score >= 0.65:  # Good match found
+        original_answer = dataset_answers[best_match_idx]
+        prompt = f"""Rephrase this university answer to be more helpful while keeping key information:
+Question: {user_input}
+Original Answer: {original_answer}
+Improved Answer:"""
+    else:  # No good match
+        prompt = f"""As a university assistant, provide a helpful response to:
+Question: {user_input}
+Answer:"""
+
+    deepseek_response = query_deepseek(prompt)
+
+    if deepseek_response:
+        for marker in ["Improved Answer:", "Answer:"]:
+            if marker in deepseek_response:
+                return deepseek_response.split(marker)[-1].strip()
+        return deepseek_response
+
+    return dataset_answers[best_match_idx] if best_score >= 0.65 else """I couldn't find specific information.
+Please visit the UOE website: https://ue.edu.pk/"""