File size: 2,730 Bytes
d1ba382
 
 
 
 
 
47214f6
d1ba382
 
47214f6
d1ba382
 
47214f6
d1ba382
47214f6
 
d1ba382
 
47214f6
d1ba382
 
47214f6
ebf68db
d1ba382
 
47214f6
 
462de64
47214f6
 
 
 
 
 
 
 
 
 
 
 
 
 
d1ba382
47214f6
 
 
 
d1ba382
47214f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import json
import requests
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv

# Load environment variables from a local .env file (expects HF_API_TOKEN).
load_dotenv()

# API configuration.
# Fail fast at import time if the token is absent, so the error surfaces
# before any request is attempted.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("Hugging Face API Token is missing from .env file")

# Hugging Face Inference API endpoint hosting the DeepSeek 7B LLM.
DEEPSEEK_API = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-llm-7b"
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

# Sentence-embedding model used only for question-similarity matching.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the FAQ dataset. Assumed shape: a list of records with "Question"
# and "Answer" keys (read defensively with .get below) — TODO confirm
# against the actual ds.json file.
with open("ds.json") as f:
    dataset = json.load(f)

# Precompute question embeddings once at startup so each query only needs
# to embed the user input. Questions are lower-cased/stripped to match
# the query-time normalization in get_best_answer.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)

def query_deepseek(prompt, timeout=30):
    """Query the DeepSeek model via the Hugging Face Inference API.

    Args:
        prompt: Text prompt sent to the model verbatim.
        timeout: Seconds to wait for the HTTP response. The original code
            passed no timeout, so a stalled connection hung forever.

    Returns:
        The generated text (stripped), or "" on any API/parse failure —
        callers treat "" as "fall back to the dataset answer".
    """
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 500,
            "temperature": 0.7,
            "do_sample": True
        }
    }
    try:
        response = requests.post(
            DEEPSEEK_API, headers=headers, json=payload, timeout=timeout
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors raised
        # above; ValueError covers a non-JSON response body. Narrower
        # than the original bare `except Exception`, which also hid
        # programming errors.
        print(f"DeepSeek API error: {e}")
        return ""
    # The inference API returns a list of {"generated_text": ...} dicts.
    if isinstance(result, list) and result:
        return result[0].get("generated_text", "").strip()
    return ""

def get_best_answer(user_input, threshold=0.65):
    """Return an answer for *user_input*, preferring dataset matches.

    Strategy:
      1. Embed the query and find the most similar dataset question.
      2. If cosine similarity reaches *threshold*, ask DeepSeek to
         rephrase the stored answer; otherwise ask it to answer freely.
      3. If DeepSeek returns nothing, fall back to the raw dataset
         answer (good match) or a generic referral message.

    Args:
        user_input: The user's question, free text.
        threshold: Minimum cosine similarity for a dataset entry to count
            as a match. Previously hard-coded as the magic number 0.65 in
            two separate places, which risked divergent edits.
    """
    # Normalize exactly as the dataset questions were normalized when
    # their embeddings were precomputed at startup.
    user_embedding = similarity_model.encode(user_input.lower().strip(), convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
    best_match_idx = similarities.argmax().item()
    best_score = similarities[best_match_idx].item()
    matched = best_score >= threshold

    if matched:  # Good match: ask the LLM to rephrase the known answer.
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""Rephrase this university answer to be more helpful while keeping key information:
        Question: {user_input}
        Original Answer: {original_answer}
        Improved Answer:"""
    else:  # No good match: ask the LLM to answer from general knowledge.
        prompt = f"""As a university assistant, provide a helpful response to:
        Question: {user_input}
        Answer:"""

    deepseek_response = query_deepseek(prompt)

    if deepseek_response:
        # The model often echoes the prompt; keep only the text after the
        # last marker. "Improved Answer:" is checked before "Answer:"
        # because the latter is a substring of the former.
        for marker in ("Improved Answer:", "Answer:"):
            if marker in deepseek_response:
                return deepseek_response.split(marker)[-1].strip()
        return deepseek_response

    # DeepSeek failed: fall back to the stored answer or a referral.
    # (The referral was previously a triple-quoted literal that leaked an
    # embedded newline and source indentation into the user-facing text.)
    if matched:
        return dataset_answers[best_match_idx]
    return ("I couldn't find specific information. "
            "Please visit the UOE website: https://ue.edu.pk/")