import os
import json

import requests
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util

# Load environment variables
load_dotenv()

# API configurations
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise ValueError("Hugging Face API Token is missing from .env file")

# DeepSeek API endpoint
DEEPSEEK_API = "https://api-inference.huggingface.co/models/deepseek-ai/deepseek-llm-7b"
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

# Upper bound (seconds) on a single inference request. Without it,
# requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 60

# Minimum cosine similarity for a dataset question to count as a match.
SIMILARITY_THRESHOLD = 0.65

# Returned when neither the model nor the dataset yields an answer.
FALLBACK_MESSAGE = """I couldn't find specific information. Please visit the UOE website: https://ue.edu.pk/"""

# Load models
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load dataset (explicit encoding so the result does not depend on the
# platform's default locale)
with open("ds.json", encoding="utf-8") as f:
    dataset = json.load(f)

# Precompute embeddings
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)


def query_deepseek(prompt):
    """Send *prompt* to the DeepSeek endpoint and return the generated text.

    Args:
        prompt: Text posted as the ``inputs`` field of the request payload.

    Returns:
        The stripped ``generated_text`` of the first element of the JSON
        response, or ``""`` when the request fails, times out, or the
        response is not a non-empty list of dicts carrying that key.
    """
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 500,
            "temperature": 0.7,
            "do_sample": True,
        },
    }

    try:
        response = requests.post(
            DEEPSEEK_API,
            headers=headers,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure, timeout, HTTP error status, or a non-JSON body.
        print(f"DeepSeek API error: {e}")
        return ""

    if not (isinstance(result, list) and result):
        return ""

    first = result[0]
    if not isinstance(first, dict):
        print(f"DeepSeek API error: unexpected response item {type(first).__name__}")
        return ""

    generated = first.get("generated_text", "")
    return generated.strip() if isinstance(generated, str) else ""


def _extract_answer(generated):
    """Return the text after the last answer marker in *generated*.

    "Improved Answer:" is checked before "Answer:" because the latter is a
    substring of the former. If neither marker occurs, *generated* is
    returned unchanged.
    """
    for marker in ("Improved Answer:", "Answer:"):
        if marker in generated:
            return generated.split(marker)[-1].strip()
    return generated


def get_best_answer(user_input):
    """Answer *user_input* using the dataset and the DeepSeek model.

    The input is lower-cased, stripped, embedded, and compared by cosine
    similarity against the precomputed dataset question embeddings. When
    the best score reaches ``SIMILARITY_THRESHOLD`` the model is asked to
    rephrase the matching dataset answer; otherwise it is asked to answer
    the question directly.

    Args:
        user_input: The user's question as a string.

    Returns:
        The model's answer with any prompt/marker prefix removed. If the
        model yields nothing usable, the matched dataset answer when there
        was a match, else ``FALLBACK_MESSAGE``.
    """
    best_match_idx = None
    has_match = False

    # With an empty dataset there is nothing to compare against, and
    # argmax over an empty similarity vector would raise.
    if dataset_answers:
        user_embedding = similarity_model.encode(
            user_input.lower().strip(), convert_to_tensor=True
        )
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()
        has_match = best_score >= SIMILARITY_THRESHOLD

    if has_match:
        # Good match found
        original_answer = dataset_answers[best_match_idx]
        prompt = (
            "Rephrase this university answer to be more helpful while "
            f"keeping key information: Question: {user_input} "
            f"Original Answer: {original_answer} Improved Answer:"
        )
    else:
        # No good match
        prompt = (
            "As a university assistant, provide a helpful \n"
            f"response to: Question: {user_input} Answer:"
        )

    deepseek_response = query_deepseek(prompt)
    if deepseek_response:
        answer = _extract_answer(deepseek_response)
        # A marker with nothing after it leaves an empty string; fall
        # through to the dataset/fallback answer instead of returning "".
        if answer:
            return answer

    return dataset_answers[best_match_idx] if has_match else FALLBACK_MESSAGE