Spaces:
Sleeping
Sleeping
File size: 12,330 Bytes
6cc218a 5a01603 3d9d5c0 5a01603 6cc218a 5a01603 6cc218a d46117e 5a01603 6cc218a 5a01603 d46117e 5a01603 6cc218a 5a01603 6cc218a 5a01603 6cc218a 5a01603 6cc218a 5a01603 4e7ca76 5a01603 c3e6470 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
# back.py
import os
import logging
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
import torch
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
import google.generativeai as genai
from datetime import datetime
import json
import pickle
@dataclass
class UserInfo:
    """User information for context.

    Captures one student's profile; ``UserManager.save_user_info`` persists
    these fields keyed by ``name``.
    """
    name: str            # student's name; used as the record key in user_data.json
    college: str         # college/university name
    degree: str          # degree programme (e.g. "BTech")
    year: int            # current year of study
    career_goals: str    # free-text description of career goals
    has_internship: bool  # whether the student already has internship experience
    has_placement: bool   # whether the student already has a placement offer
@dataclass
class ChatConfig:
    """Configuration for the chatbot.

    All defaults are evaluated once at class-definition (import) time.
    """
    embedding_model_name: str = 'all-MiniLM-L6-v2'  # SentenceTransformer model id
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'  # chosen at import time
    max_history: int = 6  # max Q/A pairs ChatMemory retains
    # NOTE(review): os.getenv returns None when GEMINI_API is unset, so this
    # may be None — downstream genai.configure would then fail; confirm env var is set.
    gemini_api_key: Optional[str] = os.getenv("GEMINI_API")
    log_file: str = "chat_history.txt"  # ChatLogger output path
    user_data_file: str = "user_data.json"  # UserManager storage path
    database_file: str = "faiss_db.pkl"  # pickled FAISS vector store path
# UserManager persists per-user profile records to a JSON file on disk.
class UserManager:
    """Manages user information storage and retrieval.

    Stores one JSON object per user, keyed by the user's name, in
    ``user_data_file``.
    """

    def __init__(self, user_data_file: str):
        self.user_data_file = user_data_file
        self.ensure_file_exists()

    def ensure_file_exists(self):
        """Create the user data file (and its parent directory) if missing."""
        if not os.path.exists(self.user_data_file):
            # BUGFIX: os.makedirs('') raises FileNotFoundError, so only create
            # the parent when the path actually has a directory component
            # (the default "user_data.json" has none).
            parent = os.path.dirname(self.user_data_file)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(self.user_data_file, 'w', encoding='utf-8') as f:
                json.dump({}, f)

    def save_user_info(self, user_info: "UserInfo") -> bool:
        """Persist ``user_info`` into the JSON file.

        Returns True on success, False on any error (the error is logged).
        """
        try:
            # First ensure the file exists with valid JSON.
            self.ensure_file_exists()
            # Read existing data; a corrupt/empty file starts a fresh dict
            # rather than crashing.
            try:
                with open(self.user_data_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except json.JSONDecodeError:
                data = {}
            # Upsert this user's record, keyed by name.
            data[user_info.name] = {
                "college": user_info.college,
                "degree": user_info.degree,
                "year": user_info.year,
                "career_goals": user_info.career_goals,
                "has_internship": user_info.has_internship,
                "has_placement": user_info.has_placement,
                "last_updated": datetime.now().isoformat(),
            }
            # Write the whole mapping back.
            with open(self.user_data_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4)
            return True
        except Exception as e:
            logging.error(f"Error saving user info: {str(e)}")
            return False
class ChatLogger:
    """Appends timestamped Q/A interactions to a plain-text log file."""

    def __init__(self, log_file: str):
        self.log_file = log_file

    def log_interaction(self, question: str, answer: str, user_info: Optional[UserInfo] = None):
        """Append one interaction; includes a one-line user summary when known."""
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        context = ""
        if user_info:
            context = (
                f"\nUser: {user_info.name} | College: {user_info.college}"
                f" | Degree: {user_info.degree} | Year: {user_info.year}"
                f" | Career Goals: {user_info.career_goals}"
            )
        entry = f"\n[{stamp}]{context}\nQ: {question}\nA: {answer}\n{'-'*50}"
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(entry)
class ChatMemory:
    """Keeps a bounded, in-order record of the most recent Q/A exchanges."""

    def __init__(self, max_history: int = 10):
        self.max_history = max_history
        self.history = []

    def add_interaction(self, question: str, answer: str):
        """Record one exchange, evicting the oldest once over capacity."""
        self.history.append({"question": question, "answer": answer})
        while len(self.history) > self.max_history:
            del self.history[0]

    def get_history(self) -> List[Dict[str, str]]:
        """Return the retained exchanges, oldest first."""
        return self.history

    def clear_history(self):
        """Forget everything recorded so far."""
        self.history = []
class QuestionGenerator:
    """Produces follow-up questions for a career-advice exchange via Gemini."""

    def __init__(self, api_key: str):
        genai.configure(api_key=api_key)
        self.generation_config = {
            "temperature": 0.1,
            "max_output_tokens": 8192,
        }
        self.model = genai.GenerativeModel(
            model_name="gemini-1.5-flash",
            generation_config=self.generation_config,
            safety_settings={'HATE': 'BLOCK_NONE','HARASSMENT': 'BLOCK_NONE','SEXUAL' : 'BLOCK_NONE','DANGEROUS' : 'BLOCK_NONE'}
        )
        # Fallback set returned whenever the model's reply cannot be parsed
        # into exactly four questions.
        self.default_questions = [
            "What are some other skills I should focus on to improve my chances?",
            "What resources or platforms can help me in my career journey?",
            "Are there any specific companies or organizations I should target for internships/placements?",
            "What are some common interview questions asked for this career path?"
        ]

    async def generate_questions(
        self,
        question: str,
        answer: str,
        user_info: Optional[UserInfo] = None
    ) -> List[str]:
        """Generate follow-up questions based on the conversation"""
        try:
            session = self.model.start_chat(history=[])
            prompt = f"""Generate 4 simple, practical follow-up questions, that a college student may ask, based on this conversation about career advice:
Question: {question}
Answer: {answer}
Focus the questions on:
1. Skills development (What skills are needed, how to improve)
2. Resources and platforms (Where to find internships, jobs, etc.)
3. Specific target companies/organizations
4. Common interview questions
Keep the language simple and student-friendly. Format each question on a new line.
NOTE: YOU MUST STRICTLY REPLY IN HINGLISH"""
            reply = session.send_message(prompt).text
            # One question per non-blank line of the reply.
            candidates = []
            for line in reply.split('\n'):
                stripped = line.strip()
                if stripped:
                    candidates.append(stripped)
            # Anything other than exactly four lines falls back to defaults.
            if len(candidates) == 4:
                return candidates
            return self.default_questions
        except Exception as e:
            logging.error(f"Error generating questions: {str(e)}")
            return self.default_questions
class GeminiRAG:
    """Answers career questions with Gemini, grounded in retrieved context."""

    def __init__(self, api_key: str):
        genai.configure(api_key=api_key)
        self.generation_config = {
            "temperature": 0.1,
            "max_output_tokens": 8192,
        }
        self.model = genai.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=self.generation_config,
            safety_settings={'HATE': 'BLOCK_NONE','HARASSMENT': 'BLOCK_NONE','SEXUAL' : 'BLOCK_NONE','DANGEROUS' : 'BLOCK_NONE'}
        )

    def create_context(self, relevant_docs: List[Dict[str, Any]]) -> str:
        """Creates a context string from relevant documents"""
        return "\n\n".join(
            f"Section: {doc['metadata']['section']}\n{doc['content']}"
            for doc in relevant_docs
        )

    async def get_answer(
        self,
        question: str,
        context: str,
        user_info: Optional[UserInfo] = None
    ) -> str:
        """Return Gemini's answer for ``question`` given retrieved ``context``."""
        try:
            session = self.model.start_chat(history=[])
            # Optional user-profile section, pre-rendered so the final prompt
            # stays simple (reduces chances of recitation).
            if user_info:
                background = f"""User Background:
- Student at {user_info.college}
- Studying {user_info.degree} (Year {user_info.year})
- Goals: {user_info.career_goals}
- {'Has internship experience' if user_info.has_internship else 'No internship yet'}
- {'Has placement' if user_info.has_placement else 'Seeking placement'}"""
            else:
                background = ''
            prompt = f"""As a career counselor, provide a helpful response based on:
Context: {context}
{background}
Question: {question}
Provide practical advice with specific examples and actionable steps."""
            try:
                response = session.send_message(prompt)
                if response.text:
                    return response.text
                return "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
            except Exception as chat_error:
                logging.error(f"Chat error: {str(chat_error)}")
                return "I encountered an error while processing your question. Please try again with a simpler question."
        except Exception as e:
            logging.error(f"Error generating answer: {str(e)}")
            return "An error occurred. Please try again later."
class CustomEmbeddings(Embeddings):
    """SentenceTransformer-backed embeddings implementing the LangChain interface."""

    def __init__(self, model_name: str, device: str):
        self.model = SentenceTransformer(model_name)
        self.model.to(device)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; returns one float vector per input."""
        with torch.no_grad():
            vectors = self.model.encode(texts, convert_to_tensor=True)
        return vectors.cpu().numpy().tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string into one float vector."""
        with torch.no_grad():
            vectors = self.model.encode([text], convert_to_tensor=True)
        return vectors.cpu().numpy().tolist()[0]
class ProductDatabase:
    """Handles document storage and retrieval via a pickled FAISS vector store."""

    def __init__(self, config: "ChatConfig"):
        self.embeddings = CustomEmbeddings(
            model_name=config.embedding_model_name,
            device=config.device
        )
        self.vectorstore = None
        self.config = config
        self.load_database()

    def load_database(self):
        """Loads the FAISS database from file"""
        try:
            if os.path.exists(self.config.database_file):
                # SECURITY: pickle.load executes arbitrary code embedded in the
                # file — only load database files from a trusted source.
                with open(self.config.database_file, "rb") as f:
                    self.vectorstore = pickle.load(f)
                print("Database loaded successfully from file.")
            else:
                print("Database file not found. Please run setup.py to create it.")
        except Exception as e:
            logging.error(f"Error loading database: {str(e)}")
            print(f"Error loading database: {str(e)}")
            self.vectorstore = None

    @staticmethod
    def _parse_markdown_sections(markdown_content: str) -> List[Dict[str, str]]:
        """Split markdown into {'content', 'section'} dicts, one per '## ' heading.

        A leading '# ' title block becomes the 'Introduction' section.
        BUGFIX: uses str.partition so a heading with no body (or a title-only
        intro) yields empty content instead of raising ValueError/IndexError.
        """
        sections = markdown_content.split('\n## ')
        documents = []
        if sections[0].startswith('# '):
            # partition('\n')[2] is '' when the intro is a title line only.
            intro = sections[0].partition('\n')[2]
            documents.append({
                "content": intro,
                "section": "Introduction"
            })
        for section in sections[1:]:
            if section.strip():
                title, _, content = section.partition('\n')
                documents.append({
                    "content": content.strip(),
                    "section": title.strip()
                })
        return documents

    def process_markdown(self, markdown_content: str):
        """Process markdown content and create vector store"""
        try:
            documents = self._parse_markdown_sections(markdown_content)
            texts = [doc["content"] for doc in documents]
            metadatas = [{"section": doc["section"]} for doc in documents]
            if self.vectorstore is None:
                self.vectorstore = FAISS.from_texts(
                    texts=texts,
                    embedding=self.embeddings,
                    metadatas=metadatas
                )
            else:
                # NOTE(review): FAISS.add_texts has no 'embedding' parameter in
                # current langchain; this kwarg rides through **kwargs — verify
                # it is accepted by the installed version.
                self.vectorstore.add_texts(texts=texts, metadatas=metadatas, embedding=self.embeddings)
        except Exception as e:
            raise Exception(f"Error processing markdown content: {str(e)}") from e

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """Search for relevant documents; returns up to ``k`` matches, or [] on error."""
        if not self.vectorstore:
            raise ValueError("Database not initialized. Please process documents first.")
        try:
            docs = self.vectorstore.similarity_search(query, k=k)
            return [{"content": doc.page_content, "metadata": doc.metadata} for doc in docs]
        except Exception as e:
            logging.error(f"Error during search: {str(e)}")
            return []