from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import faiss
import logging
from typing import List, Dict
from pathlib import Path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class RAGSystem:
    def __init__(self):
        """Initialize the RAG system"""
        try:
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embeddings = None
            self.index = None
            self.df = None
            logger.info("RAG system initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAG system: {str(e)}")
            raise

    def load_and_process_data(self, df: pd.DataFrame, cache_dir: Path = None):
        """Load and process the course data with caching support"""
        try:
            # Validate input
            if df is None or len(df) == 0:
                raise ValueError("Empty or None DataFrame provided")
            required_columns = ['title', 'description', 'curriculum', 'url']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")

            self.df = df
            vector_dimension = 384  # dimension for all-MiniLM-L6-v2

            # Try loading from cache first
            if cache_dir is not None:
                cache_dir.mkdir(exist_ok=True)
                embeddings_path = cache_dir / 'course_embeddings.npy'
                index_path = cache_dir / 'faiss_index.bin'
                if embeddings_path.exists() and index_path.exists():
                    logger.info("Loading cached embeddings and index...")
                    try:
                        self.embeddings = np.load(str(embeddings_path))
                        self.index = faiss.read_index(str(index_path))
                        logger.info("Successfully loaded cached data")
                        return
                    except Exception as e:
                        logger.warning(f"Failed to load cache: {e}. Computing new embeddings...")

            # Compute new embeddings
            logger.info("Computing course embeddings...")
            texts = [
                f"{row['title']}. {row['description']}"
                for _, row in df.iterrows()
            ]
            if not texts:
                raise ValueError("No texts to encode")
            self.embeddings = self.model.encode(
                texts,
                show_progress_bar=True,
                convert_to_numpy=True
            )
            if self.embeddings.size == 0:
                raise ValueError("Failed to generate embeddings")

            # Create and populate FAISS index
            self.index = faiss.IndexFlatL2(vector_dimension)
            self.index.add(self.embeddings.astype('float32'))

            # Save to cache if directory provided
            if cache_dir is not None:
                logger.info("Saving embeddings and index to cache...")
                np.save(str(embeddings_path), self.embeddings)
                faiss.write_index(self.index, str(index_path))

            logger.info(f"Successfully processed {len(df)} courses")
        except Exception as e:
            logger.error(f"Error processing data: {str(e)}")
            raise

    def search_courses(self, query: str, top_k: int = 5) -> Dict:
        """Search for courses using semantic search with improved ranking"""
        try:
            # Ensure the FAISS index is initialized
            if self.index is None:
                raise ValueError("FAISS index not initialized. Please load data first.")

            # Get query embedding
            query_embedding = self.model.encode([query], convert_to_numpy=True)

            # Get initial similarity scores (retrieve extra candidates for re-ranking)
            D, I = self.index.search(query_embedding.reshape(1, -1).astype('float32'), top_k * 2)
            distances = D[0]
            indices = I[0]

            # Get results with additional metadata
            results = []
            for dist, idx in zip(distances, indices):
                # FAISS pads with -1 when fewer candidates exist than requested
                if idx < 0:
                    continue
                course = self.df.iloc[idx].to_dict()

                # Calculate relevance score components
                title_similarity = self.calculate_text_similarity(query, course['title'])
                desc_similarity = self.calculate_text_similarity(query, course['description'])

                # Combine scores with weights
                final_score = (
                    0.4 * (1 - dist) +
                    0.4 * title_similarity +
                    0.2 * desc_similarity
                )
                results.append({
                    **course,
                    'relevance_score': final_score
                })

            # Sort by final relevance score and take top_k
            results.sort(key=lambda x: x['relevance_score'], reverse=True)
            results = results[:top_k]

            return {
                'query': query,
                'results': results
            }
        except Exception as e:
            logger.error(f"Error in search_courses: {str(e)}")
            raise

    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate text similarity between two strings using word overlap

        Args:
            text1 (str): First text string
            text2 (str): Second text string

        Returns:
            float: Similarity score between 0 and 1
        """
        try:
            # Convert to lowercase and split into words
            text1 = str(text1).lower()
            text2 = str(text2).lower()
            words1 = set(text1.split())
            words2 = set(text2.split())
            if not words1 or not words2:
                return 0.0

            # Calculate Jaccard similarity
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            if union == 0:
                return 0.0
            similarity = intersection / union
            return similarity
        except Exception as e:
            logger.error(f"Error calculating text similarity: {str(e)}")
            return 0.0
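

# --- Usage sketch (illustrative only, not part of the original Space code) ---
# A minimal example of how RAGSystem might be driven end to end, assuming a
# small hypothetical DataFrame with the required 'title', 'description',
# 'curriculum', and 'url' columns and a local "cache" directory for the
# embeddings and FAISS index. Data values and paths here are made up.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        'title': ["Intro to Python", "Deep Learning Basics"],
        'description': ["Learn Python from scratch.", "Neural networks and backpropagation."],
        'curriculum': ["Variables, loops, functions", "Perceptrons, CNNs, optimizers"],
        'url': ["https://example.com/python", "https://example.com/dl"],
    })

    rag = RAGSystem()
    rag.load_and_process_data(sample_df, cache_dir=Path("cache"))
    response = rag.search_courses("beginner python course", top_k=2)
    for result in response['results']:
        print(f"{result['relevance_score']:.3f}  {result['title']}")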